Test Failed
Push — master ( 64abe2...a464fa )
by Chris
04:02 queued 11s
created

_synoname_word_approximation()   F

Complexity

Conditions 73

Size

Total Lines 170
Code Lines 123

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
cc 73
eloc 123
nop 5
dl 0
loc 170
rs 0
c 0
b 0
f 0

How to fix   Long Method    Complexity   

Long Method

Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.

For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.

Commonly applied refactorings include Extract Method.

Complexity

Complex classes like abydos.distance.synoname._synoname_word_approximation() often do a lot of different things. To break such a class down, we need to identify a cohesive component within that class. A common approach to find such a component is to look for fields/methods that share the same prefixes, or suffixes.

Once you have determined the fields that belong together, you can apply the Extract Class refactoring. If the component makes sense as a sub-class, Extract Subclass is also a candidate, and is often faster.

1
# -*- coding: utf-8 -*-
2
3
# Copyright 2018 by Christopher C. Little.
4
# This file is part of Abydos.
5
#
6
# Abydos is free software: you can redistribute it and/or modify
7
# it under the terms of the GNU General Public License as published by
8
# the Free Software Foundation, either version 3 of the License, or
9
# (at your option) any later version.
10
#
11
# Abydos is distributed in the hope that it will be useful,
12
# but WITHOUT ANY WARRANTY; without even the implied warranty of
13
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14
# GNU General Public License for more details.
15
#
16
# You should have received a copy of the GNU General Public License
17
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.
18
19
"""abydos.distance.synoname.
20
21
The distance.synoname module implements Synoname.
22
"""
23
24
from __future__ import division, unicode_literals

try:
    from collections.abc import Iterable
except ImportError:  # Python 2: the ABCs live directly in ``collections``
    from collections import Iterable

from .levenshtein import levenshtein
from .sequence import sim_ratcliff_obershelp
# noinspection PyProtectedMember
from ..fingerprint.synoname import _synoname_special_table, synoname_toolcode
32
33
__all__ = ['synoname']
34
35
36
def _synoname_strip_punct(word):
    """Return a word with punctuation stripped out.

    Every character that belongs to the fixed punctuation set below is
    dropped; all other characters (including internal spaces) are kept, and
    leading/trailing whitespace is then removed from the result.

    :param str word: a word to strip punctuation from
    :returns: The word stripped of punctuation
    :rtype: str

    >>> _synoname_strip_punct('AB;CD EF-GH$IJ')
    'ABCD EFGHIJ'
    """
    # Build the lookup set once per call instead of once per character.
    punctuation = frozenset(',-./:;"&\'()!{|}?$%*+<=>[\\]^_`~')
    return ''.join(char for char in word if char not in punctuation).strip()
50
51
52
def _synoname_remove_abc_specials(full_name, specials):
    """Cut the type 'a', 'b' and 'c' special terms out of a full name.

    :param str full_name: the joined "last first" name
    :param specials: iterable of (special table index, type) pairs
    :returns: the name with the matching terms cut out
    :rtype: str
    """
    for s_pos, s_type in specials:
        if s_type not in ('a', 'b', 'c'):
            # Other types never touch the special table in this pass.
            continue
        term = _synoname_special_table[s_pos][1]
        if s_type == 'a':
            # Drop the term's length plus one character from the end.
            full_name = full_name[:-(1+len(term))]
        elif s_type == 'b':
            # Cut the term at the first ' term ' occurrence.
            # NOTE(review): when the term is not found, find() gives -1, so
            # loc is 0 and the first len(term) characters are cut instead.
            loc = full_name.find(' '+term+' ')+1
            full_name = full_name[:loc] + full_name[loc+len(term):]
        else:
            # Type 'c': drop the term's length plus one character from the
            # start.
            full_name = full_name[1+len(term):]
    return full_name


def _synoname_remove_dx_specials(full_name, specials):
    """Cut the type 'd' and 'X' special terms out of a full name.

    :param str full_name: the name after the 'a'/'b'/'c' pass
    :param specials: iterable of (special table index, type) pairs
    :returns: the name with the matching terms cut out
    :rtype: str
    """
    for s_pos, s_type in specials:
        if s_type == 'd':
            # Drop as many leading characters as the term is long.
            full_name = full_name[len(_synoname_special_table[s_pos][1]):]
        elif s_type == 'X':
            term = _synoname_special_table[s_pos][1]
            if term in full_name:
                # NOTE(review): loc points at the space *before* the term
                # (or is -1 when the term has no leading space), so the cut
                # is shifted one character left of the term itself. Kept
                # as-is; confirm against the reference algorithm.
                loc = full_name.find(' '+term)
                full_name = full_name[:loc] + full_name[loc+len(term):]
    return full_name


def _synoname_last_name_found(src_ln, tar_ln, src_words, tar_words):
    """Return True if a word of either name occurs in the other's last name.

    A word counts as found when the last name ends with it or contains it
    followed by a space. Target words are checked against the source last
    name first, then source words against the target last name.
    """
    def _in_last_name(word, last_name):
        return last_name.endswith(word) or word+' ' in last_name

    if any(_in_last_name(word, src_ln) for word in tar_words):
        return True
    return any(_in_last_name(word, tar_ln) for word in src_words)


def _synoname_count_word_matches(src_words, tar_words):
    """Count equal (source word, target word) pairs.

    Both lists are modified in place: each matched slot is overwritten with
    '@' so that a target slot is not counted again for a later source word.
    """
    matches = 0
    for i, s_word in enumerate(src_words):
        for j, t_word in enumerate(tar_words):
            if s_word == t_word:
                src_words[i] = '@'
                tar_words[j] = '@'
                matches += 1
    return matches


def _synoname_word_ratio(src_ln, tar_ln, src_words, tar_words, has_specials):
    """Return the matched-word ratio for two word lists, or None.

    None is returned when no word is found in the other name's last name or
    when too few words match to accept the ratio.
    """
    src_num_words = len(src_words)
    tar_num_words = len(tar_words)
    if not _synoname_last_name_found(src_ln, tar_ln, src_words, tar_words):
        return None

    matches = _synoname_count_word_matches(src_words, tar_words)
    if matches > 1 or (matches == 1 and
                       src_num_words == 1 and tar_num_words == 1 and
                       has_specials):
        # Dividing only here (matches >= 1 implies both lists are non-empty)
        # avoids a ZeroDivisionError when both names have no words left.
        return matches/max(tar_num_words, src_num_words)
    return None


def _synoname_word_approximation(src_ln, tar_ln, src_fn='', tar_fn='',
                                 features=None):
    """Return the Synoname word approximation score for two names.

    Two variants of each full name are compared: the first with the type
    'a'/'b'/'c' special terms removed, the second with the type 'd'/'X'
    terms removed as well. The score is the number of matching words divided
    by the word count of the longer name. The numbered comments in the body
    are the step numbers carried over from the original implementation.

    :param str src_ln: last name of the source
    :param str tar_ln: last name of the target
    :param str src_fn: first name of the source (optional)
    :param str tar_fn: first name of the target (optional)
    :param features: a dict containing special features calculated via
        fingerprint.synoname_toolcode() (optional); the keys read are
        'gen_conflict', 'roman_conflict', 'src_specials' and 'tar_specials'
    :returns: The word approximation score (the int 0 or 1 on the
        short-circuit paths, otherwise a float ratio)
    :rtype: float

    >>> _synoname_word_approximation('Smith Waterman', 'Waterman',
    ... 'Tom Joe Bob', 'Tom Joe')
    0.6
    """
    if features is None:
        features = {}
    # Read with defaults rather than inserting keys, so that the caller's
    # dict is left untouched.
    src_specials = features.get('src_specials', [])
    tar_specials = features.get('tar_specials', [])
    src_len_specials = len(src_specials)
    tar_len_specials = len(tar_specials)
    has_specials = tar_len_specials > 0 or src_len_specials > 0

    # 1
    if features.get('gen_conflict') or features.get('roman_conflict'):
        return 0

    # 3 & 7
    full_tar1 = _synoname_remove_abc_specials(
        ' '.join((tar_ln, tar_fn)).replace('-', ' ').strip(), tar_specials)
    full_src1 = _synoname_remove_abc_specials(
        ' '.join((src_ln, src_fn)).replace('-', ' ').strip(), src_specials)
    full_tar2 = _synoname_remove_dx_specials(full_tar1, tar_specials)
    full_src2 = _synoname_remove_dx_specials(full_src1, src_specials)

    tar1_words = _synoname_strip_punct(full_tar1).split()
    src1_words = _synoname_strip_punct(full_src1).split()
    tar2_words = _synoname_strip_punct(full_tar2).split()
    src2_words = _synoname_strip_punct(full_src2).split()

    tar1_num_words = len(tar1_words)
    src1_num_words = len(src1_words)
    tar2_num_words = len(tar2_words)
    src2_num_words = len(src2_words)

    # 2
    if (src1_num_words < 2 and src_len_specials == 0 and src2_num_words < 2 and
            tar_len_specials == 0):
        return 0

    # 4
    if (tar1_num_words == 1 and src1_num_words == 1 and
            tar1_words[0] == src1_words[0]):
        return 1
    if tar1_num_words < 2 and tar_len_specials == 0:
        return 0

    # 5 & 6
    w_ratio = _synoname_word_ratio(src_ln, tar_ln, src1_words, tar1_words,
                                   has_specials)
    if w_ratio is not None:
        return w_ratio

    # 8
    if (tar2_num_words == 1 and src2_num_words == 1 and
            tar2_words[0] == src2_words[0]):
        return 1
    # I see no way that the following can be True if the equivalent in
    # #4 was False.
    if tar2_num_words < 2 and tar_len_specials == 0:  # pragma: no cover
        return 0

    # 9 & 10
    w_ratio = _synoname_word_ratio(src_ln, tar_ln, src2_words, tar2_words,
                                   has_specials)
    if w_ratio is not None:
        return w_ratio

    return 0
222
223
224
def synoname(src, tar, word_approx_min=0.3, char_approx_min=0.73,
             tests=2**12-1, ret_name=False):
    """Return the Synoname similarity type of two words.

    Cf. :cite:`Getty:1991,Gross:1991`

    Each name may be given as a ``(last name, first name, qualifier)``
    tuple, as a ``#``-delimited string whose final three fields are read in
    that same order, or as a plain string, which is taken to be the last
    name.

    The enabled tests are tried in a fixed order (exact, omission,
    substitution, transposition, punctuation, initials, extension,
    inclusion, no_first, word_approx, char_approx) and the first one that
    matches determines the result. The 'confusions' test is only evaluated
    as part of the 'word_approx' test.

    :param src: source name for comparison
    :type src: str or tuple
    :param tar: target name for comparison
    :type tar: str or tuple
    :param float word_approx_min: the minimum word approximation value to
        signal a 'word_approx' match
    :param float char_approx_min: the minimum character approximation value to
        signal a 'char_approx' match
    :param int or Iterable tests: either an integer bitmask indicating tests
        to perform or an iterable of test names to perform; unknown names
        are ignored (defaults to performing all tests)
    :param bool ret_name: if True, returns the match name rather than its
        integer equivalent
    :returns: Synoname value
    :rtype: int (or str if ret_name is True)

    >>> synoname(('Breghel', 'Pieter', ''), ('Brueghel', 'Pieter', ''))
    2
    >>> synoname(('Breghel', 'Pieter', ''), ('Brueghel', 'Pieter', ''),
    ... ret_name=True)
    'omission'
    >>> synoname(('Dore', 'Gustave', ''),
    ... ('Dore', 'Paul Gustave Louis Christophe', ''),
    ... ret_name=True)
    'inclusion'
    >>> synoname(('Pereira', 'I. R.', ''), ('Pereira', 'I. Smith', ''),
    ... ret_name=True)
    'word_approx'
    """
    test_names = ['exact', 'omission', 'substitution', 'transposition',
                  'punctuation', 'initials', 'extension', 'inclusion',
                  'no_first', 'word_approx', 'confusions', 'char_approx']
    # Each test owns one bit of the ``tests`` bitmask.
    test_dict = {val: 2**n for n, val in enumerate(test_names)}
    # Match type 0 is unnamed; the last type means that no test matched.
    match_name = [''] + test_names + ['no_match']
    match_type_dict = {val: n for n, val in enumerate(match_name)}

    if isinstance(tests, Iterable):
        new_tests = 0
        for term in tests:
            if term in test_dict:
                # OR the bit in: adding would let a repeated name carry into
                # the bit of a different test.
                new_tests |= test_dict[term]
        tests = new_tests

    def _split_name(name):
        """Return stripped, lowercased (last, first, qualifier) of a name."""
        if isinstance(name, tuple):
            last, first, qual = name
        elif '#' in name:
            last, first, qual = name.split('#')[-3:]
        else:
            last, first, qual = name, '', ''
        return (last.strip().lower(), first.strip().lower(),
                qual.strip().lower())

    def _split_special(spec):
        """Split a string of 4-character records into (int, type) pairs."""
        spec_list = []
        while spec:
            spec_list.append((int(spec[:3]), spec[3:4]))
            spec = spec[4:]
        return spec_list

    def _parse_toolcode(toolcode):
        """Return (generation, romancode, len_fn, specials) of a toolcode."""
        generation = int(toolcode[2])
        romancode = int(toolcode[3:6])
        len_fn = int(toolcode[6:8])
        # The specials are the second '$'-delimited field of the toolcode.
        specials = _split_special(toolcode.split('$')[1])
        return generation, romancode, len_fn, specials

    def _strip_master(full_name):
        """Drop a leading 'master ' and any introductory phrase after it."""
        if full_name.startswith('master '):
            full_name = full_name[len('master '):]
            for intro in ['of the ', 'of ', 'known as the ', 'with the ',
                          'with ']:
                if full_name.startswith(intro):
                    full_name = full_name[len(intro):]
        return full_name

    def _fmt_retval(val):
        """Return the match name or the int value, depending on ret_name."""
        if ret_name:
            return match_name[val]
        return val

    # 1. Preprocessing

    # Lowercasing
    src_ln, src_fn, src_qual = _split_name(src)
    tar_ln, tar_fn, tar_qual = _split_name(tar)

    # Create toolcodes
    src_ln, src_fn, src_tc = synoname_toolcode(src_ln, src_fn, src_qual)
    tar_ln, tar_fn, tar_tc = synoname_toolcode(tar_ln, tar_fn, tar_qual)

    (src_generation, src_romancode,
     src_len_fn, src_specials) = _parse_toolcode(src_tc)
    (tar_generation, tar_romancode,
     tar_len_fn, tar_specials) = _parse_toolcode(tar_tc)

    # A conflict means the two codes differ and at least one is non-zero.
    gen_conflict = ((src_generation != tar_generation) and
                    bool(src_generation or tar_generation))
    roman_conflict = ((src_romancode != tar_romancode) and
                      bool(src_romancode or tar_romancode))

    ln_equal = src_ln == tar_ln
    fn_equal = src_fn == tar_fn

    # The cost tuples below are presumably (insert, delete, substitute,
    # transpose) -- verify against levenshtein(). Each test prices every
    # other edit type at 99, so a distance of 1 means exactly one edit of
    # the kind being tested.
    if tests & test_dict['exact'] and fn_equal and ln_equal:
        return _fmt_retval(match_type_dict['exact'])
    if tests & test_dict['omission']:
        if (fn_equal and
                levenshtein(src_ln, tar_ln, cost=(1, 1, 99, 99)) == 1):
            if not roman_conflict:
                return _fmt_retval(match_type_dict['omission'])
        elif (ln_equal and
              levenshtein(src_fn, tar_fn, cost=(1, 1, 99, 99)) == 1):
            return _fmt_retval(match_type_dict['omission'])
    if tests & test_dict['substitution']:
        if (fn_equal and
                levenshtein(src_ln, tar_ln, cost=(99, 99, 1, 99)) == 1):
            return _fmt_retval(match_type_dict['substitution'])
        elif (ln_equal and
              levenshtein(src_fn, tar_fn, cost=(99, 99, 1, 99)) == 1):
            return _fmt_retval(match_type_dict['substitution'])
    if tests & test_dict['transposition']:
        if (fn_equal and
                (levenshtein(src_ln, tar_ln, mode='osa', cost=(99, 99, 99, 1))
                 == 1)):
            return _fmt_retval(match_type_dict['transposition'])
        elif (ln_equal and
              (levenshtein(src_fn, tar_fn, mode='osa', cost=(99, 99, 99, 1))
               == 1)):
            return _fmt_retval(match_type_dict['transposition'])
    if tests & test_dict['punctuation']:
        # First compare with punctuation simply removed, then again with
        # hyphens turned into spaces before the removal.
        for hyphen_to_space in (False, True):
            parts = [src_fn, tar_fn, src_ln, tar_ln]
            if hyphen_to_space:
                parts = [part.replace('-', ' ') for part in parts]
            np_src_fn, np_tar_fn, np_src_ln, np_tar_ln = [
                _synoname_strip_punct(part) for part in parts]
            if (np_src_fn == np_tar_fn) and (np_src_ln == np_tar_ln):
                return _fmt_retval(match_type_dict['punctuation'])

    if tests & test_dict['initials'] and ln_equal:
        if src_fn and tar_fn:
            src_initials = _synoname_strip_punct(src_fn).split()
            tar_initials = _synoname_strip_punct(tar_fn).split()
            # True when every word of at least one first name is a single
            # character (word count equals total character count).
            initials = bool((len(src_initials) == len(''.join(src_initials)))
                            or
                            (len(tar_initials) == len(''.join(tar_initials))))
            if initials:
                src_initials = ''.join(word[0] for word in src_initials)
                tar_initials = ''.join(word[0] for word in tar_initials)
                if src_initials == tar_initials:
                    return _fmt_retval(match_type_dict['initials'])
                initial_diff = abs(len(src_initials)-len(tar_initials))
                if (initial_diff and
                        ((initial_diff ==
                          levenshtein(src_initials, tar_initials,
                                      cost=(1, 99, 99, 99))) or
                         (initial_diff ==
                          levenshtein(tar_initials, src_initials,
                                      cost=(1, 99, 99, 99))))):
                    return _fmt_retval(match_type_dict['initials'])
    if tests & test_dict['extension']:
        # Slices instead of [1] so that last names shorter than two
        # characters compare as '' rather than raising IndexError.
        if src_ln[1:2] == tar_ln[1:2] and (src_ln.startswith(tar_ln) or
                                           tar_ln.startswith(src_ln)):
            if (((not src_len_fn and not tar_len_fn) or
                 (tar_fn and src_fn.startswith(tar_fn)) or
                 (src_fn and tar_fn.startswith(src_fn)))
                    and not roman_conflict):
                return _fmt_retval(match_type_dict['extension'])
    if tests & test_dict['inclusion'] and ln_equal:
        # NOTE(review): the second clause tests tar_fn against src_ln, not
        # src_fn, which makes this test asymmetric. Kept as-is; confirm
        # whether ``tar_fn in src_fn`` was intended.
        if (src_fn and src_fn in tar_fn) or (tar_fn and tar_fn in src_ln):
            return _fmt_retval(match_type_dict['inclusion'])
    if tests & test_dict['no_first'] and ln_equal:
        if src_fn == '' or tar_fn == '':
            return _fmt_retval(match_type_dict['no_first'])
    if tests & test_dict['word_approx']:
        ratio = _synoname_word_approximation(src_ln, tar_ln, src_fn, tar_fn,
                                             {'gen_conflict': gen_conflict,
                                              'roman_conflict': roman_conflict,
                                              'src_specials': src_specials,
                                              'tar_specials': tar_specials})
        if ratio == 1 and tests & test_dict['confusions']:
            if (' '.join((src_fn, src_ln)).strip() ==
                    ' '.join((tar_fn, tar_ln)).strip()):
                return _fmt_retval(match_type_dict['confusions'])
        if ratio >= word_approx_min:
            return _fmt_retval(match_type_dict['word_approx'])
    if tests & test_dict['char_approx']:
        # The character ratio is only computed once this test is reached; a
        # generation or roman numeral conflict forces it to 0.
        if gen_conflict or roman_conflict:
            ca_ratio = 0
        else:
            ca_ratio = sim_ratcliff_obershelp(
                _strip_master(' '.join((src_ln, src_fn))),
                _strip_master(' '.join((tar_ln, tar_fn))))
        if ca_ratio >= char_approx_min:
            return _fmt_retval(match_type_dict['char_approx'])
    return _fmt_retval(match_type_dict['no_match'])
457
458
459
if __name__ == '__main__':
    # When executed as a script, run the doctests embedded in the
    # docstrings of this module.
    import doctest
    doctest.testmod()
462