| Total Complexity | 157 |
| Total Lines | 809 |
| Duplicated Lines | 4.2 % |
| Coverage | 99.61% |
| Changes | 0 | ||
Duplicate code is one of the most pungent code smells. A commonly applied rule of thumb — the "Rule of Three" — is to restructure code once it has been duplicated in three or more places.
Common duplication problems, and their corresponding solutions, are described below.
Complex classes like abydos.distance._synoname often do a lot of different things. To break such a class down, we need to identify a cohesive component within that class. A common approach to find such a component is to look for fields/methods that share the same prefixes, or suffixes.
Once you have determined the fields that belong together, you can apply the Extract Class refactoring. If the component makes sense as a sub-class, Extract Subclass is also a candidate, and is often faster.
# -*- coding: utf-8 -*-

# Copyright 2018 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.
| 19 | 1 | """abydos.distance._synoname. |
|
| 20 | |||
| 21 | Synoname. |
||
| 22 | """ |
||
| 23 | |||
| 24 | 1 | from __future__ import ( |
|
| 25 | absolute_import, |
||
| 26 | division, |
||
| 27 | print_function, |
||
| 28 | unicode_literals, |
||
| 29 | ) |
||
| 30 | |||
| 31 | 1 | from collections import Iterable |
|
| 32 | |||
| 33 | 1 | from ._distance import _Distance |
|
| 34 | 1 | from ._levenshtein import levenshtein |
|
| 35 | 1 | from ._ratcliff_obershelp import sim_ratcliff_obershelp |
|
| 36 | |||
| 37 | # noinspection PyProtectedMember |
||
| 38 | 1 | from ..fingerprint._synoname import SynonameToolcode |
|
| 39 | |||
| 40 | 1 | __all__ = ['Synoname', 'synoname'] |
|
| 41 | |||
| 42 | |||
class Synoname(_Distance):
    """Synoname.

    Cf. :cite:`Getty:1991,Gross:1991`
    """

    _stc = SynonameToolcode()

    # Punctuation characters ignored by _synoname_strip_punct; built once at
    # class-creation time instead of on every character comparison.
    _punct = frozenset(',-./:;"&\'()!{|}?$%*+<=>[\\]^_`~')

    # Bit values for each Synoname test, used to build the ``tests`` bitmask.
    _test_dict = {
        val: 2 ** n
        for n, val in enumerate(
            (
                'exact',
                'omission',
                'substitution',
                'transposition',
                'punctuation',
                'initials',
                'extension',
                'inclusion',
                'no_first',
                'word_approx',
                'confusions',
                'char_approx',
            )
        )
    }
    # Match-type names indexed by their integer code (0 is unused).
    _match_name = (
        '',
        'exact',
        'omission',
        'substitution',
        'transposition',
        'punctuation',
        'initials',
        'extension',
        'inclusion',
        'no_first',
        'word_approx',
        'confusions',
        'char_approx',
        'no_match',
    )
    # Reverse lookup: match-type name -> integer code.
    _match_type_dict = {val: n for n, val in enumerate(_match_name)}

    def _synoname_strip_punct(self, word):
        """Return a word with punctuation stripped out.

        Parameters
        ----------
        word : str
            A word to strip punctuation from

        Returns
        -------
        str
            The word stripped of punctuation

        Examples
        --------
        >>> pe = Synoname()
        >>> pe._synoname_strip_punct('AB;CD EF-GH$IJ')
        'ABCD EFGHIJ'

        """
        return ''.join(
            char for char in word if char not in self._punct
        ).strip()

    def _strip_abc_specials(self, name, specials):
        """Strip type 'a', 'b', and 'c' special terms from a full name.

        Helper for :py:meth:`_synoname_word_approximation`; removes special
        terms from the end ('a'), middle ('b'), or beginning ('c') of *name*.
        """
        for s_pos, s_type in specials:
            term = self._stc._synoname_special_table[s_pos][1]  # noqa: SF01
            if s_type == 'a':
                # Drop the trailing term plus its preceding separator.
                name = name[: -(1 + len(term))]
            elif s_type == 'b':
                # Excise a space-delimited term from mid-string.
                loc = name.find(' ' + term + ' ') + 1
                name = name[:loc] + name[loc + len(term) :]
            elif s_type == 'c':
                # Drop the leading term plus its following separator.
                name = name[1 + len(term) :]
        return name

    def _strip_dx_specials(self, name, specials):
        """Strip type 'd' and 'X' special terms from a full name.

        Helper for :py:meth:`_synoname_word_approximation`; removes a leading
        term ('d') or a space-prefixed mid-string term ('X') from *name*.
        """
        for s_pos, s_type in specials:
            term = self._stc._synoname_special_table[s_pos][1]  # noqa: SF01
            if s_type == 'd':
                name = name[len(term) :]
            elif s_type == 'X' and term in name:
                # NOTE(review): the find anchors on ' '+term but only
                # len(term) characters are removed, as in the original
                # algorithm — preserved verbatim.
                loc = name.find(' ' + term)
                name = name[:loc] + name[loc + len(term) :]
        return name

    @staticmethod
    def _last_word_found(tar_words, src_words, src_ln, tar_ln):
        """Check whether a word of either name appears in the other's last name.

        Helper for :py:meth:`_synoname_word_approximation` (steps #5 and #9).
        """
        for word in tar_words:
            if src_ln.endswith(word) or word + ' ' in src_ln:
                return True
        for word in src_words:
            if tar_ln.endswith(word) or word + ' ' in tar_ln:
                return True
        return False

    @staticmethod
    def _cross_off_matches(src_words, tar_words):
        """Count words shared by both lists, marking each match with '@'.

        Helper for :py:meth:`_synoname_word_approximation` (steps #6 and #10).
        Mutates both lists in place; the loop structure (including continuing
        the inner scan after a match) reproduces the original exactly.
        """
        matches = 0
        for i, s_word in enumerate(src_words):
            for j, t_word in enumerate(tar_words):
                if s_word == t_word:
                    src_words[i] = '@'
                    tar_words[j] = '@'
                    matches += 1
        return matches

    def _synoname_word_approximation(
        self, src_ln, tar_ln, src_fn='', tar_fn='', features=None
    ):
        """Return the Synoname word approximation score for two names.

        Parameters
        ----------
        src_ln : str
            Last name of the source
        tar_ln : str
            Last name of the target
        src_fn : str
            First name of the source (optional)
        tar_fn : str
            First name of the target (optional)
        features : dict
            A dict containing special features calculated using
            :py:class:`fingerprint.SynonameToolcode` (optional)

        Returns
        -------
        float
            The word approximation score

        Examples
        --------
        >>> pe = Synoname()
        >>> pe._synoname_word_approximation('Smith Waterman', 'Waterman',
        ... 'Tom Joe Bob', 'Tom Joe')
        0.6

        """
        if features is None:
            features = {}
        if 'src_specials' not in features:
            features['src_specials'] = []
        if 'tar_specials' not in features:
            features['tar_specials'] = []

        src_len_specials = len(features['src_specials'])
        tar_len_specials = len(features['tar_specials'])

        # 1: generation or Roman-numeral conflicts preclude a match.
        if features.get('gen_conflict') or features.get('roman_conflict'):
            return 0

        # 3 & 7: join names, drop hyphens, strip a/b/c special terms.
        full_tar1 = ' '.join((tar_ln, tar_fn)).replace('-', ' ').strip()
        full_tar1 = self._strip_abc_specials(
            full_tar1, features['tar_specials']
        )

        full_src1 = ' '.join((src_ln, src_fn)).replace('-', ' ').strip()
        full_src1 = self._strip_abc_specials(
            full_src1, features['src_specials']
        )

        # Secondary forms with d/X special terms also stripped.
        full_tar2 = self._strip_dx_specials(full_tar1, features['tar_specials'])
        full_src2 = self._strip_dx_specials(full_src1, features['src_specials'])

        full_tar1 = self._synoname_strip_punct(full_tar1)
        tar1_words = full_tar1.split()
        tar1_num_words = len(tar1_words)

        full_src1 = self._synoname_strip_punct(full_src1)
        src1_words = full_src1.split()
        src1_num_words = len(src1_words)

        full_tar2 = self._synoname_strip_punct(full_tar2)
        tar2_words = full_tar2.split()
        tar2_num_words = len(tar2_words)

        full_src2 = self._synoname_strip_punct(full_src2)
        src2_words = full_src2.split()
        src2_num_words = len(src2_words)

        # 2
        # NOTE(review): the pairing of src2_num_words with tar_len_specials
        # looks asymmetric but is preserved from the original algorithm.
        if (
            src1_num_words < 2
            and src_len_specials == 0
            and src2_num_words < 2
            and tar_len_specials == 0
        ):
            return 0

        # 4
        if (
            tar1_num_words == 1
            and src1_num_words == 1
            and tar1_words[0] == src1_words[0]
        ):
            return 1
        if tar1_num_words < 2 and tar_len_specials == 0:
            return 0

        # 5
        last_found = self._last_word_found(
            tar1_words, src1_words, src_ln, tar_ln
        )

        # 6
        matches = 0
        if last_found:
            matches = self._cross_off_matches(src1_words, tar1_words)
        w_ratio = matches / max(tar1_num_words, src1_num_words)
        if matches > 1 or (
            matches == 1
            and src1_num_words == 1
            and tar1_num_words == 1
            and (tar_len_specials > 0 or src_len_specials > 0)
        ):
            return w_ratio

        # 8
        if (
            tar2_num_words == 1
            and src2_num_words == 1
            and tar2_words[0] == src2_words[0]
        ):
            return 1
        # I see no way that the following can be True if the equivalent in
        # #4 was False.
        if tar2_num_words < 2 and tar_len_specials == 0:  # pragma: no cover
            return 0

        # 9
        last_found = self._last_word_found(
            tar2_words, src2_words, src_ln, tar_ln
        )

        if not last_found:
            return 0

        # 10
        matches = self._cross_off_matches(src2_words, tar2_words)
        w_ratio = matches / max(tar2_num_words, src2_num_words)
        if matches > 1 or (
            matches == 1
            and src2_num_words == 1
            and tar2_num_words == 1
            and (tar_len_specials > 0 or src_len_specials > 0)
        ):
            return w_ratio

        return 0

    def dist_abs(
        self,
        src,
        tar,
        word_approx_min=0.3,
        char_approx_min=0.73,
        tests=2 ** 12 - 1,
        ret_name=False,
    ):
        """Return the Synoname similarity type of two words.

        Parameters
        ----------
        src : str
            Source string for comparison
        tar : str
            Target string for comparison
        word_approx_min : float
            The minimum word approximation value to signal a 'word_approx'
            match
        char_approx_min : float
            The minimum character approximation value to signal a 'char_approx'
            match
        tests : int or Iterable
            Either an integer indicating tests to perform or a list of test
            names to perform (defaults to performing all tests)
        ret_name : bool
            If True, returns the match name rather than its integer equivalent

        Returns
        -------
        int (or str if ret_name is True)
            Synoname value

        Examples
        --------
        >>> cmp = Synoname()
        >>> cmp.dist_abs(('Breghel', 'Pieter', ''), ('Brueghel', 'Pieter', ''))
        2
        >>> cmp.dist_abs(('Breghel', 'Pieter', ''), ('Brueghel', 'Pieter', ''),
        ... ret_name=True)
        'omission'
        >>> cmp.dist_abs(('Dore', 'Gustave', ''),
        ... ('Dore', 'Paul Gustave Louis Christophe', ''), ret_name=True)
        'inclusion'
        >>> cmp.dist_abs(('Pereira', 'I. R.', ''), ('Pereira', 'I. Smith', ''),
        ... ret_name=True)
        'word_approx'

        """
        # A list of test names is converted to the equivalent bitmask.
        if isinstance(tests, Iterable):
            new_tests = 0
            for term in tests:
                if term in self._test_dict:
                    new_tests += self._test_dict[term]
            tests = new_tests

        # Accept a (last, first, qualifier) tuple, a '#'-delimited string,
        # or a bare last name.
        if isinstance(src, tuple):
            src_ln, src_fn, src_qual = src
        elif '#' in src:
            src_ln, src_fn, src_qual = src.split('#')[-3:]
        else:
            src_ln, src_fn, src_qual = src, '', ''

        if isinstance(tar, tuple):
            tar_ln, tar_fn, tar_qual = tar
        elif '#' in tar:
            tar_ln, tar_fn, tar_qual = tar.split('#')[-3:]
        else:
            tar_ln, tar_fn, tar_qual = tar, '', ''

        def _split_special(spec):
            """Split a toolcode specials string into (position, type) pairs."""
            spec_list = []
            while spec:
                spec_list.append((int(spec[:3]), spec[3:4]))
                spec = spec[4:]
            return spec_list

        def _fmt_retval(val):
            """Format the return value as a name or integer per ret_name."""
            if ret_name:
                return self._match_name[val]
            return val

        # 1. Preprocessing

        # Lowercasing
        src_fn = src_fn.strip().lower()
        src_ln = src_ln.strip().lower()
        src_qual = src_qual.strip().lower()

        tar_fn = tar_fn.strip().lower()
        tar_ln = tar_ln.strip().lower()
        tar_qual = tar_qual.strip().lower()

        # Create toolcodes
        src_ln, src_fn, src_tc = self._stc.fingerprint(
            src_ln, src_fn, src_qual
        )
        tar_ln, tar_fn, tar_tc = self._stc.fingerprint(
            tar_ln, tar_fn, tar_qual
        )

        src_generation = int(src_tc[2])
        src_romancode = int(src_tc[3:6])
        src_len_fn = int(src_tc[6:8])
        src_tc = src_tc.split('$')
        src_specials = _split_special(src_tc[1])

        tar_generation = int(tar_tc[2])
        tar_romancode = int(tar_tc[3:6])
        tar_len_fn = int(tar_tc[6:8])
        tar_tc = tar_tc.split('$')
        tar_specials = _split_special(tar_tc[1])

        gen_conflict = (src_generation != tar_generation) and bool(
            src_generation or tar_generation
        )
        roman_conflict = (src_romancode != tar_romancode) and bool(
            src_romancode or tar_romancode
        )

        ln_equal = src_ln == tar_ln
        fn_equal = src_fn == tar_fn

        # approx_c
        def _approx_c():
            """Compute the character-approximation flag and ratio."""
            if gen_conflict or roman_conflict:
                return False, 0

            full_src = ' '.join((src_ln, src_fn))
            if full_src.startswith('master '):
                full_src = full_src[len('master ') :]
                for intro in [
                    'of the ',
                    'of ',
                    'known as the ',
                    'with the ',
                    'with ',
                ]:
                    if full_src.startswith(intro):
                        full_src = full_src[len(intro) :]

            full_tar = ' '.join((tar_ln, tar_fn))
            if full_tar.startswith('master '):
                full_tar = full_tar[len('master ') :]
                for intro in [
                    'of the ',
                    'of ',
                    'known as the ',
                    'with the ',
                    'with ',
                ]:
                    if full_tar.startswith(intro):
                        full_tar = full_tar[len(intro) :]

            loc_ratio = sim_ratcliff_obershelp(full_src, full_tar)
            return loc_ratio >= char_approx_min, loc_ratio

        # Only the ratio is used below; the boolean is discarded.
        _, ca_ratio = _approx_c()

        if tests & self._test_dict['exact'] and fn_equal and ln_equal:
            return _fmt_retval(self._match_type_dict['exact'])
        if tests & self._test_dict['omission']:
            if (
                fn_equal
                and levenshtein(src_ln, tar_ln, cost=(1, 1, 99, 99)) == 1
            ):
                if not roman_conflict:
                    return _fmt_retval(self._match_type_dict['omission'])
            elif (
                ln_equal
                and levenshtein(src_fn, tar_fn, cost=(1, 1, 99, 99)) == 1
            ):
                return _fmt_retval(self._match_type_dict['omission'])
        if tests & self._test_dict['substitution']:
            if (
                fn_equal
                and levenshtein(src_ln, tar_ln, cost=(99, 99, 1, 99)) == 1
            ):
                return _fmt_retval(self._match_type_dict['substitution'])
            elif (
                ln_equal
                and levenshtein(src_fn, tar_fn, cost=(99, 99, 1, 99)) == 1
            ):
                return _fmt_retval(self._match_type_dict['substitution'])
        if tests & self._test_dict['transposition']:
            if fn_equal and (
                levenshtein(src_ln, tar_ln, mode='osa', cost=(99, 99, 99, 1))
                == 1
            ):
                return _fmt_retval(self._match_type_dict['transposition'])
            elif ln_equal and (
                levenshtein(src_fn, tar_fn, mode='osa', cost=(99, 99, 99, 1))
                == 1
            ):
                return _fmt_retval(self._match_type_dict['transposition'])
        if tests & self._test_dict['punctuation']:
            np_src_fn = self._synoname_strip_punct(src_fn)
            np_tar_fn = self._synoname_strip_punct(tar_fn)
            np_src_ln = self._synoname_strip_punct(src_ln)
            np_tar_ln = self._synoname_strip_punct(tar_ln)

            if (np_src_fn == np_tar_fn) and (np_src_ln == np_tar_ln):
                return _fmt_retval(self._match_type_dict['punctuation'])

            # Retry with hyphens treated as spaces.
            np_src_fn = self._synoname_strip_punct(src_fn.replace('-', ' '))
            np_tar_fn = self._synoname_strip_punct(tar_fn.replace('-', ' '))
            np_src_ln = self._synoname_strip_punct(src_ln.replace('-', ' '))
            np_tar_ln = self._synoname_strip_punct(tar_ln.replace('-', ' '))

            if (np_src_fn == np_tar_fn) and (np_src_ln == np_tar_ln):
                return _fmt_retval(self._match_type_dict['punctuation'])

        if tests & self._test_dict['initials'] and ln_equal:
            if src_fn and tar_fn:
                src_initials = self._synoname_strip_punct(src_fn).split()
                tar_initials = self._synoname_strip_punct(tar_fn).split()
                # True when at least one first name is all single-character
                # words (i.e. consists only of initials).
                initials = bool(
                    (len(src_initials) == len(''.join(src_initials)))
                    or (len(tar_initials) == len(''.join(tar_initials)))
                )
                if initials:
                    src_initials = ''.join(_[0] for _ in src_initials)
                    tar_initials = ''.join(_[0] for _ in tar_initials)
                    if src_initials == tar_initials:
                        return _fmt_retval(self._match_type_dict['initials'])
                    initial_diff = abs(len(src_initials) - len(tar_initials))
                    if initial_diff and (
                        (
                            initial_diff
                            == levenshtein(
                                src_initials,
                                tar_initials,
                                cost=(1, 99, 99, 99),
                            )
                        )
                        or (
                            initial_diff
                            == levenshtein(
                                tar_initials,
                                src_initials,
                                cost=(1, 99, 99, 99),
                            )
                        )
                    ):
                        return _fmt_retval(self._match_type_dict['initials'])
        if tests & self._test_dict['extension']:
            # NOTE(review): indexing [1] raises IndexError for last names
            # shorter than 2 characters; preserved from the original.
            if src_ln[1] == tar_ln[1] and (
                src_ln.startswith(tar_ln) or tar_ln.startswith(src_ln)
            ):
                if (
                    (not src_len_fn and not tar_len_fn)
                    or (tar_fn and src_fn.startswith(tar_fn))
                    or (src_fn and tar_fn.startswith(src_fn))
                ) and not roman_conflict:
                    return _fmt_retval(self._match_type_dict['extension'])
        if tests & self._test_dict['inclusion'] and ln_equal:
            # Fixed: the second clause compared tar_fn against src_ln (a
            # last name); by symmetry with the first clause it must test
            # inclusion within src_fn.
            if (src_fn and src_fn in tar_fn) or (tar_fn and tar_fn in src_fn):
                return _fmt_retval(self._match_type_dict['inclusion'])
        if tests & self._test_dict['no_first'] and ln_equal:
            if src_fn == '' or tar_fn == '':
                return _fmt_retval(self._match_type_dict['no_first'])
        if tests & self._test_dict['word_approx']:
            ratio = self._synoname_word_approximation(
                src_ln,
                tar_ln,
                src_fn,
                tar_fn,
                {
                    'gen_conflict': gen_conflict,
                    'roman_conflict': roman_conflict,
                    'src_specials': src_specials,
                    'tar_specials': tar_specials,
                },
            )
            if ratio == 1 and tests & self._test_dict['confusions']:
                if (
                    ' '.join((src_fn, src_ln)).strip()
                    == ' '.join((tar_fn, tar_ln)).strip()
                ):
                    return _fmt_retval(self._match_type_dict['confusions'])
            if ratio >= word_approx_min:
                return _fmt_retval(self._match_type_dict['word_approx'])
        if tests & self._test_dict['char_approx']:
            if ca_ratio >= char_approx_min:
                return _fmt_retval(self._match_type_dict['char_approx'])
        return _fmt_retval(self._match_type_dict['no_match'])

    def dist(
        self,
        src,
        tar,
        word_approx_min=0.3,
        char_approx_min=0.73,
        tests=2 ** 12 - 1,
    ):
        """Return the normalized Synoname distance between two words.

        Parameters
        ----------
        src : str
            Source string for comparison
        tar : str
            Target string for comparison
        word_approx_min : float
            The minimum word approximation value to signal a 'word_approx'
            match
        char_approx_min : float
            The minimum character approximation value to signal a 'char_approx'
            match
        tests : int or Iterable
            Either an integer indicating tests to perform or a list of test
            names to perform (defaults to performing all tests)

        Returns
        -------
        float
            Normalized Synoname distance

        """
        # Call dist_abs directly instead of routing through the module-level
        # wrapper, which needlessly constructed a second Synoname instance.
        return (
            self.dist_abs(
                src, tar, word_approx_min, char_approx_min, tests, False
            )
            / 14
        )
| 749 | |||
| 750 | |||
def synoname(
    src,
    tar,
    word_approx_min=0.3,
    char_approx_min=0.73,
    tests=2 ** 12 - 1,
    ret_name=False,
):
    """Return the Synoname similarity type of two words.

    This is a wrapper for :py:meth:`Synoname.dist_abs`.

    Parameters
    ----------
    src : str
        Source string for comparison
    tar : str
        Target string for comparison
    word_approx_min : float
        The minimum word approximation value to signal a 'word_approx' match
    char_approx_min : float
        The minimum character approximation value to signal a 'char_approx'
        match
    tests : int or Iterable
        Either an integer indicating tests to perform or a list of test names
        to perform (defaults to performing all tests)
    ret_name : bool
        If True, returns the match name rather than its integer equivalent

    Returns
    -------
    int (or str if ret_name is True)
        Synoname value

    Examples
    --------
    >>> synoname(('Breghel', 'Pieter', ''), ('Brueghel', 'Pieter', ''))
    2
    >>> synoname(('Breghel', 'Pieter', ''), ('Brueghel', 'Pieter', ''),
    ... ret_name=True)
    'omission'
    >>> synoname(('Dore', 'Gustave', ''),
    ... ('Dore', 'Paul Gustave Louis Christophe', ''), ret_name=True)
    'inclusion'
    >>> synoname(('Pereira', 'I. R.', ''), ('Pereira', 'I. Smith', ''),
    ... ret_name=True)
    'word_approx'

    """
    # Delegate to a throwaway Synoname instance, forwarding all arguments.
    comparator = Synoname()
    return comparator.dist_abs(
        src, tar, word_approx_min, char_approx_min, tests, ret_name
    )
| 803 | |||
| 804 | |||
if __name__ == '__main__':
    # Run this module's embedded doctests when executed as a script.
    import doctest

    doctest.testmod()
||
| 809 |