| Conditions | 18 |
| Total Lines | 114 |
| Code Lines | 53 |
| Lines | 0 |
| Ratio | 0 % |
| Tests | 47 |
| CRAP Score | 18 |
| Changes | 0 | ||
Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.
For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.
Commonly applied refactorings include: Extract Method.
If many parameters/temporary variables are present: Replace Temp with Query, Introduce Parameter Object, or Preserve Whole Object.
Complex units like the function abydos.distance._baystat.sim_baystat() often do a lot of different things. To break such a unit down, we need to identify a cohesive component within it. A common approach to find such a component is to look for fields, variables, or methods that share the same prefixes or suffixes.
Once you have determined the fields that belong together, you can apply the Extract Class refactoring. If the component makes sense as a sub-class, Extract Subclass is also a candidate, and is often faster.
| 1 | # -*- coding: utf-8 -*- |
||
def sim_baystat(src, tar, min_ss_len=None, left_ext=None, right_ext=None):
    """Return the Baystat similarity.

    Good results for shorter words are reported when setting min_ss_len to 1
    and either left_ext OR right_ext to 1.

    The Baystat similarity is defined in :cite:`Furnohr:2002`.

    This is ostensibly a port of the R module PPRL's implementation:
    https://github.com/cran/PPRL/blob/master/src/MTB_Baystat.cpp
    :cite:`Rukasz:2018`.

    Each of the three tuning parameters is defaulted independently when it
    is not supplied. The defaults depend on the longer of the two strings:
    for a length of 7 or more they are ``min_ss_len=2, left_ext=2,
    right_ext=2``; otherwise ``min_ss_len=1, left_ext=0, right_ext=0``.
    An explicit ``0`` for ``left_ext`` or ``right_ext`` is honored (it means
    "no extension on that side"); a ``min_ss_len`` of ``0`` or ``None`` falls
    back to the default because an empty substring cannot be matched.

    :param str src: source string for comparison
    :param str tar: target string for comparison
    :param int min_ss_len: minimum substring length to be considered
    :param int left_ext: left-side extension length
    :param int right_ext: right-side extension length
    :returns: the Baystat similarity
    :rtype: float

    >>> round(sim_baystat('cat', 'hat'), 12)
    0.666666666667
    >>> sim_baystat('Niall', 'Neil')
    0.4
    >>> round(sim_baystat('Colin', 'Cuilen'), 12)
    0.166666666667
    >>> sim_baystat('ATCG', 'TAGC')
    0.0
    >>> sim_baystat('ab', 'ba', min_ss_len=1, left_ext=1, right_ext=0)
    0.5
    """
    if src == tar:
        return 1.0
    if not src or not tar:
        return 0.0

    src_len = len(src)
    # Flagging matched characters below replaces them one-for-one with '#',
    # so the length of tar never changes and can be cached here.
    tar_len = len(tar)
    max_len = max(src_len, tar_len)

    # Defaults are based on values from the article. The paper suggests that
    # for short names, (exclusively) one or the other of left_ext and
    # right_ext can be 1, with good results; 0 & 0 is the default here.
    if max_len >= 7:
        default_ss_len, default_ext = 2, 2
    else:
        default_ss_len, default_ext = 1, 0

    # Default each parameter on its own so that a legitimate explicit 0 for
    # one extension does not discard the caller's other settings.
    if not min_ss_len:
        min_ss_len = default_ss_len
    if left_ext is None:
        left_ext = default_ext
    if right_ext is None:
        right_ext = default_ext

    pos = 0
    match_len = 0

    while pos + min_ss_len <= src_len:
        hit_len = 0
        ix = 1

        substring = src[pos : pos + min_ss_len]

        # Clamp the search window to the start and end of tar.
        search_begin = max(pos - left_ext, 0)
        left_ext_len = min(pos, left_ext)
        right_ext_len = min(right_ext, tar_len - pos - min_ss_len)

        search_end = (
            search_begin + left_ext_len + min_ss_len + right_ext_len
        )
        # right_ext_len may be negative once pos runs past the end of tar;
        # the guard prevents a negative end index from being interpreted
        # as "count from the end" by the slice.
        if search_end > search_begin:
            search_val = tar[search_begin:search_end]
        else:
            search_val = ''

        flagged_tar = ''
        # Greedily extend the matched substring one character at a time.
        while substring in search_val and pos + ix <= src_len:
            hit_len = len(substring)
            flagged_tar = tar.replace(substring, '#' * hit_len)

            if pos + min_ss_len + ix <= src_len:
                substring = src[pos : pos + min_ss_len + ix]

            if pos + min_ss_len + right_ext_len + 1 <= tar_len:
                right_ext_len += 1

            search_val = tar[
                search_begin : (
                    search_begin + left_ext_len + min_ss_len + right_ext_len
                )
            ]

            ix += 1

        if hit_len > 0:
            # Consume the matched characters so they cannot match again.
            tar = flagged_tar

        match_len += hit_len
        pos += ix

    return match_len / max_len
|
| 143 | |||
| 175 |