Completed
Pull Request — master (#141)
by Chris
13:03
created

abydos.distance._synoname.synoname()   A

Complexity

Conditions 1

Size

Total Lines 45
Code Lines 9

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 2
CRAP Score 1

Importance

Changes 0
Metric Value
eloc 9
dl 0
loc 45
ccs 2
cts 2
cp 1
rs 9.95
c 0
b 0
f 0
cc 1
nop 6
crap 1
1
# -*- coding: utf-8 -*-
2
3
# Copyright 2018 by Christopher C. Little.
4
# This file is part of Abydos.
5
#
6
# Abydos is free software: you can redistribute it and/or modify
7
# it under the terms of the GNU General Public License as published by
8
# the Free Software Foundation, either version 3 of the License, or
9
# (at your option) any later version.
10
#
11
# Abydos is distributed in the hope that it will be useful,
12
# but WITHOUT ANY WARRANTY; without even the implied warranty of
13
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14
# GNU General Public License for more details.
15
#
16
# You should have received a copy of the GNU General Public License
17
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.
18
19 1
"""abydos.distance.synoname.
20
21
The distance.synoname module implements Synoname.
22
"""
23
24 1
from __future__ import division, unicode_literals
25
26 1
from collections import Iterable
27
28 1
from ._distance import Distance
29 1
from ._levenshtein import levenshtein
30 1
from ._sequence import sim_ratcliff_obershelp
31
32
# noinspection PyProtectedMember
33 1
from ..fingerprint._synoname import SynonameToolcode
34
35 1
__all__ = ['Synoname', 'synoname']
36
37
38 1
class Synoname(Distance):
0 ignored issues
show
Unused Code introduced by
The variable __class__ seems to be unused.
Loading history...
39
    """Synoname.
40
41
    Cf. :cite:`Getty:1991,Gross:1991`
42
    """
43
44 1
    stc = SynonameToolcode()
45
46 1
    # Names of the possible match results, in order. A name's position in
    # this tuple is the integer code recorded for it in ``match_type_dict``.
    # Position 0 is the empty name and the last position is 'no_match'.
    match_name = (
        '',
        'exact',
        'omission',
        'substitution',
        'transposition',
        'punctuation',
        'initials',
        'extension',
        'inclusion',
        'no_first',
        'word_approx',
        'confusions',
        'char_approx',
        'no_match',
    )
    # One bit flag per selectable test: 'exact' is 1, 'omission' is 2, and so
    # on up to 'char_approx' at 2048. The empty name and 'no_match' are not
    # tests, so both ends of ``match_name`` are excluded.
    test_dict = {name: 1 << bit for bit, name in enumerate(match_name[1:-1])}
    # Reverse lookup: match name -> integer code (its index in match_name).
    match_type_dict = dict(zip(match_name, range(len(match_name))))
82
83 1
    def _synoname_strip_punct(self, word):
0 ignored issues
show
Coding Style introduced by
This method could be written as a function/class method.

If a method does not access any attributes of the class, it could also be implemented as a function or static method. This can help improve readability. For example

class Foo:
    def some_method(self, x, y):
        return x + y;

could be written as

class Foo:
    @classmethod
    def some_method(cls, x, y):
        return x + y;
Loading history...
84
        """Return a word with punctuation stripped out.
85
86
        Args:
87
            word (str): a word to strip punctuation from
88
89
        Returns:
90
            str: The word stripped of punctuation
91
92
        Examples:
93
            >>> pe = Synoname()
94
            >>> pe._synoname_strip_punct('AB;CD EF-GH$IJ')
95
            'ABCD EFGHIJ'
96
97
        """
98 1
        stripped = ''
99 1
        for char in word:
100 1
            if char not in set(',-./:;"&\'()!{|}?$%*+<=>[\\]^_`~'):
101 1
                stripped += char
102 1
        return stripped.strip()
103
104 1
    def _synoname_word_approximation(
0 ignored issues
show
best-practice introduced by
Too many arguments (6/5)
Loading history...
Comprehensibility introduced by
This function exceeds the maximum number of variables (32/15).
Loading history...
best-practice introduced by
Too many return statements (10/6)
Loading history...
105
        self, src_ln, tar_ln, src_fn='', tar_fn='', features=None
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
106
    ):
107
        """Return the Synoname word approximation score for two names.
108
109
        Args:
110
            src_ln (str): Last name of the source
111
            tar_ln (str): Last name of the target
112
            src_fn (str): First name of the source (optional)
113
            tar_fn (str): First name of the target (optional)
114
            features (dict): A dict containing special features calculated
115
                using :py:class:`fingerprint.SynonameToolcode` (optional)
116
117
        Returns:
118
            float: The word approximation score
119
120
        Examples:
121
            >>> pe = Synoname()
122
            >>> pe._synoname_word_approximation('Smith Waterman', 'Waterman',
123
            ... 'Tom Joe Bob', 'Tom Joe')
124
            0.6
125
126
        """
127 1
        if features is None:
128 1
            features = {}
129 1
        if 'src_specials' not in features:
130 1
            features['src_specials'] = []
131 1
        if 'tar_specials' not in features:
132 1
            features['tar_specials'] = []
133
134 1
        src_len_specials = len(features['src_specials'])
135 1
        tar_len_specials = len(features['tar_specials'])
136
137
        # 1
138 1
        if ('gen_conflict' in features and features['gen_conflict']) or (
139
            'roman_conflict' in features and features['roman_conflict']
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
140
        ):
141 1
            return 0
142
143
        # 3 & 7
144 1
        full_tar1 = ' '.join((tar_ln, tar_fn)).replace('-', ' ').strip()
145 1
        for s_pos, s_type in features['tar_specials']:
146 1
            if s_type == 'a':
147 1
                full_tar1 = full_tar1[
148
                    : -(1 + len(self.stc.synoname_special_table[s_pos][1]))
149
                ]
150 1
            elif s_type == 'b':
151 1
                loc = (
152
                    full_tar1.find(
153
                        ' ' + self.stc.synoname_special_table[s_pos][1] + ' '
154
                    )
155
                    + 1
156
                )
157 1
                full_tar1 = (
158
                    full_tar1[:loc]
159
                    + full_tar1[
160
                        loc + len(self.stc.synoname_special_table[s_pos][1]) :
161
                    ]
162
                )
163 1
            elif s_type == 'c':
164 1
                full_tar1 = full_tar1[
165
                    1 + len(self.stc.synoname_special_table[s_pos][1]) :
166
                ]
167
168 1
        full_src1 = ' '.join((src_ln, src_fn)).replace('-', ' ').strip()
169 1
        for s_pos, s_type in features['src_specials']:
170 1
            if s_type == 'a':
171 1
                full_src1 = full_src1[
172
                    : -(1 + len(self.stc.synoname_special_table[s_pos][1]))
173
                ]
174 1
            elif s_type == 'b':
175 1
                loc = (
176
                    full_src1.find(
177
                        ' ' + self.stc.synoname_special_table[s_pos][1] + ' '
178
                    )
179
                    + 1
180
                )
181 1
                full_src1 = (
182
                    full_src1[:loc]
183
                    + full_src1[
184
                        loc + len(self.stc.synoname_special_table[s_pos][1]) :
185
                    ]
186
                )
187 1
            elif s_type == 'c':
188 1
                full_src1 = full_src1[
189
                    1 + len(self.stc.synoname_special_table[s_pos][1]) :
190
                ]
191
192 1
        full_tar2 = full_tar1
193 1
        for s_pos, s_type in features['tar_specials']:
194 1
            if s_type == 'd':
195 1
                full_tar2 = full_tar2[
196
                    len(self.stc.synoname_special_table[s_pos][1]) :
197
                ]
198 1
            elif (
199
                s_type == 'X'
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
200
                and self.stc.synoname_special_table[s_pos][1] in full_tar2
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
201
            ):
202 1
                loc = full_tar2.find(
203
                    ' ' + self.stc.synoname_special_table[s_pos][1]
204
                )
205 1
                full_tar2 = (
206
                    full_tar2[:loc]
207
                    + full_tar2[
208
                        loc + len(self.stc.synoname_special_table[s_pos][1]) :
209
                    ]
210
                )
211
212 1
        full_src2 = full_src1
213 1
        for s_pos, s_type in features['src_specials']:
214 1
            if s_type == 'd':
215 1
                full_src2 = full_src2[
216
                    len(self.stc.synoname_special_table[s_pos][1]) :
217
                ]
218 1
            elif (
219
                s_type == 'X'
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
220
                and self.stc.synoname_special_table[s_pos][1] in full_src2
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
221
            ):
222 1
                loc = full_src2.find(
223
                    ' ' + self.stc.synoname_special_table[s_pos][1]
224
                )
225 1
                full_src2 = (
226
                    full_src2[:loc]
227
                    + full_src2[
228
                        loc + len(self.stc.synoname_special_table[s_pos][1]) :
229
                    ]
230
                )
231
232 1
        full_tar1 = self._synoname_strip_punct(full_tar1)
233 1
        tar1_words = full_tar1.split()
234 1
        tar1_num_words = len(tar1_words)
235
236 1
        full_src1 = self._synoname_strip_punct(full_src1)
237 1
        src1_words = full_src1.split()
238 1
        src1_num_words = len(src1_words)
239
240 1
        full_tar2 = self._synoname_strip_punct(full_tar2)
241 1
        tar2_words = full_tar2.split()
242 1
        tar2_num_words = len(tar2_words)
243
244 1
        full_src2 = self._synoname_strip_punct(full_src2)
245 1
        src2_words = full_src2.split()
246 1
        src2_num_words = len(src2_words)
247
248
        # 2
249 1
        if (
250
            src1_num_words < 2
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
251
            and src_len_specials == 0
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
252
            and src2_num_words < 2
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
253
            and tar_len_specials == 0
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
254
        ):
255 1
            return 0
256
257
        # 4
258 1
        if (
259
            tar1_num_words == 1
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
260
            and src1_num_words == 1
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
261
            and tar1_words[0] == src1_words[0]
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
262
        ):
263 1
            return 1
264 1
        if tar1_num_words < 2 and tar_len_specials == 0:
265 1
            return 0
266
267
        # 5
268 1
        last_found = False
269 1
        for word in tar1_words:
270 1
            if src_ln.endswith(word) or word + ' ' in src_ln:
271 1
                last_found = True
272
273 1
        if not last_found:
274 1
            for word in src1_words:
275 1
                if tar_ln.endswith(word) or word + ' ' in tar_ln:
276 1
                    last_found = True
277
278
        # 6
279 1
        matches = 0
280 1
        if last_found:
281 1
            for i, s_word in enumerate(src1_words):
282 1
                for j, t_word in enumerate(tar1_words):
283 1
                    if s_word == t_word:
284 1
                        src1_words[i] = '@'
285 1
                        tar1_words[j] = '@'
286 1
                        matches += 1
287 1
        w_ratio = matches / max(tar1_num_words, src1_num_words)
288 1
        if matches > 1 or (
0 ignored issues
show
best-practice introduced by
Too many boolean expressions in if statement (6/5)
Loading history...
289
            matches == 1
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
290
            and src1_num_words == 1
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
291
            and tar1_num_words == 1
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
292
            and (tar_len_specials > 0 or src_len_specials > 0)
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
293
        ):
294 1
            return w_ratio
295
296
        # 8
297 1
        if (
298
            tar2_num_words == 1
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
299
            and src2_num_words == 1
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
300
            and tar2_words[0] == src2_words[0]
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
301
        ):
302 1
            return 1
303
        # I see no way that the following can be True if the equivalent in
304
        # #4 was False.
305
        if tar2_num_words < 2 and tar_len_specials == 0:  # pragma: no cover
306
            return 0
307
308
        # 9
309 1
        last_found = False
310 1
        for word in tar2_words:
311 1
            if src_ln.endswith(word) or word + ' ' in src_ln:
312 1
                last_found = True
313
314 1
        if not last_found:
315 1
            for word in src2_words:
316 1
                if tar_ln.endswith(word) or word + ' ' in tar_ln:
317 1
                    last_found = True
318
319 1
        if not last_found:
320 1
            return 0
321
322
        # 10
323 1
        matches = 0
324 1
        if last_found:
325 1
            for i, s_word in enumerate(src2_words):
326 1
                for j, t_word in enumerate(tar2_words):
327 1
                    if s_word == t_word:
328 1
                        src2_words[i] = '@'
329 1
                        tar2_words[j] = '@'
330 1
                        matches += 1
331 1
        w_ratio = matches / max(tar2_num_words, src2_num_words)
332 1
        if matches > 1 or (
0 ignored issues
show
best-practice introduced by
Too many boolean expressions in if statement (6/5)
Loading history...
333
            matches == 1
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
334
            and src2_num_words == 1
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
335
            and tar2_num_words == 1
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
336
            and (tar_len_specials > 0 or src_len_specials > 0)
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
337
        ):
338
            return w_ratio
339
340 1
        return 0
341
342 1
    def dist_abs(
0 ignored issues
show
best-practice introduced by
Too many arguments (7/5)
Loading history...
Comprehensibility introduced by
This function exceeds the maximum number of variables (44/15).
Loading history...
Bug introduced by
Parameters differ from overridden 'dist_abs' method
Loading history...
best-practice introduced by
Too many return statements (18/6)
Loading history...
343
        self,
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
344
        src,
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
345
        tar,
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
346
        word_approx_min=0.3,
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
347
        char_approx_min=0.73,
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
348
        tests=2 ** 12 - 1,
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
349
        ret_name=False,
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
350
    ):
351
        """Return the Synoname similarity type of two words.
352
353
        Args:
354
            src (str): Source string for comparison
355
            tar (str): Target string for comparison
356
            word_approx_min (float): the minimum word approximation value to
357
                signal a 'word_approx' match
358
            char_approx_min (float): the minimum character approximation value
359
                to signal a 'char_approx' match
360
            tests (int or Iterable): either an integer indicating tests to
361
                perform or a list of test names to perform (defaults to
362
                performing all tests)
363
            ret_name (bool): If True, returns the match name rather than its
364
                integer equivalent
365
366
        Returns:
367
            int (or str if ret_name is True): Synoname value
368
369
        Examples:
370
            >>> synoname(('Breghel', 'Pieter', ''), ('Brueghel', 'Pieter', ''))
371
            2
372
            >>> synoname(('Breghel', 'Pieter', ''), ('Brueghel', 'Pieter', ''),
373
            ... ret_name=True)
374
            'omission'
375
            >>> synoname(('Dore', 'Gustave', ''),
376
            ... ('Dore', 'Paul Gustave Louis Christophe', ''),
377
            ... ret_name=True)
378
            'inclusion'
379
            >>> synoname(('Pereira', 'I. R.', ''), ('Pereira', 'I. Smith', ''),
380
            ... ret_name=True)
381
            'word_approx'
382
383
        """
384 1
        if isinstance(tests, Iterable):
385 1
            new_tests = 0
386 1
            for term in tests:
387 1
                if term in self.test_dict:
388 1
                    new_tests += self.test_dict[term]
389 1
            tests = new_tests
390
391 1
        if isinstance(src, tuple):
392 1
            src_ln, src_fn, src_qual = src
393 1
        elif '#' in src:
394 1
            src_ln, src_fn, src_qual = src.split('#')[-3:]
395
        else:
396 1
            src_ln, src_fn, src_qual = src, '', ''
397
398 1
        if isinstance(tar, tuple):
399 1
            tar_ln, tar_fn, tar_qual = tar
400 1
        elif '#' in tar:
401 1
            tar_ln, tar_fn, tar_qual = tar.split('#')[-3:]
402
        else:
403 1
            tar_ln, tar_fn, tar_qual = tar, '', ''
404
405 1
        def _split_special(spec):
406 1
            spec_list = []
407 1
            while spec:
408 1
                spec_list.append((int(spec[:3]), spec[3:4]))
409 1
                spec = spec[4:]
410 1
            return spec_list
411
412 1
        def _fmt_retval(val):
413 1
            if ret_name:
414 1
                return self.match_name[val]
415 1
            return val
416
417
        # 1. Preprocessing
418
419
        # Lowercasing
420 1
        src_fn = src_fn.strip().lower()
421 1
        src_ln = src_ln.strip().lower()
422 1
        src_qual = src_qual.strip().lower()
423
424 1
        tar_fn = tar_fn.strip().lower()
425 1
        tar_ln = tar_ln.strip().lower()
426 1
        tar_qual = tar_qual.strip().lower()
427
428
        # Create toolcodes
429 1
        src_ln, src_fn, src_tc = self.stc.fingerprint(src_ln, src_fn, src_qual)
430 1
        tar_ln, tar_fn, tar_tc = self.stc.fingerprint(tar_ln, tar_fn, tar_qual)
431
432 1
        src_generation = int(src_tc[2])
433 1
        src_romancode = int(src_tc[3:6])
434 1
        src_len_fn = int(src_tc[6:8])
435 1
        src_tc = src_tc.split('$')
436 1
        src_specials = _split_special(src_tc[1])
437
438 1
        tar_generation = int(tar_tc[2])
439 1
        tar_romancode = int(tar_tc[3:6])
440 1
        tar_len_fn = int(tar_tc[6:8])
441 1
        tar_tc = tar_tc.split('$')
442 1
        tar_specials = _split_special(tar_tc[1])
443
444 1
        gen_conflict = (src_generation != tar_generation) and bool(
445
            src_generation or tar_generation
446
        )
447 1
        roman_conflict = (src_romancode != tar_romancode) and bool(
448
            src_romancode or tar_romancode
449
        )
450
451 1
        ln_equal = src_ln == tar_ln
452 1
        fn_equal = src_fn == tar_fn
453
454
        # approx_c
455 1
        def _approx_c():
456 1
            if gen_conflict or roman_conflict:
457 1
                return False, 0
458
459 1
            full_src = ' '.join((src_ln, src_fn))
460 1
            if full_src.startswith('master '):
461 1
                full_src = full_src[len('master ') :]
462 1
                for intro in [
463
                    'of the ',
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
464
                    'of ',
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
465
                    'known as the ',
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
466
                    'with the ',
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
467
                    'with ',
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
468
                ]:
469 1
                    if full_src.startswith(intro):
470 1
                        full_src = full_src[len(intro) :]
471
472 1
            full_tar = ' '.join((tar_ln, tar_fn))
473 1
            if full_tar.startswith('master '):
474 1
                full_tar = full_tar[len('master ') :]
475 1
                for intro in [
476
                    'of the ',
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
477
                    'of ',
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
478
                    'known as the ',
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
479
                    'with the ',
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
480
                    'with ',
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
481
                ]:
482 1
                    if full_tar.startswith(intro):
483 1
                        full_tar = full_tar[len(intro) :]
484
485 1
            loc_ratio = sim_ratcliff_obershelp(full_src, full_tar)
486 1
            return loc_ratio >= char_approx_min, loc_ratio
487
488 1
        approx_c_result, ca_ratio = _approx_c()
0 ignored issues
show
Unused Code introduced by
The variable approx_c_result seems to be unused.
Loading history...
489
490 1
        if tests & self.test_dict['exact'] and fn_equal and ln_equal:
491 1
            return _fmt_retval(self.match_type_dict['exact'])
492 1
        if tests & self.test_dict['omission']:
493 1 View Code Duplication
            if (
0 ignored issues
show
Duplication introduced by
This code seems to be duplicated in your project.
Loading history...
494
                fn_equal
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
495
                and levenshtein(src_ln, tar_ln, cost=(1, 1, 99, 99)) == 1
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
496
            ):
497 1
                if not roman_conflict:
498 1
                    return _fmt_retval(self.match_type_dict['omission'])
499 1
            elif (
500
                ln_equal
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
501
                and levenshtein(src_fn, tar_fn, cost=(1, 1, 99, 99)) == 1
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
502
            ):
503 1
                return _fmt_retval(self.match_type_dict['omission'])
504 1 View Code Duplication
        if tests & self.test_dict['substitution']:
0 ignored issues
show
Duplication introduced by
This code seems to be duplicated in your project.
Loading history...
505 1
            if (
506
                fn_equal
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
507
                and levenshtein(src_ln, tar_ln, cost=(99, 99, 1, 99)) == 1
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
508
            ):
509 1
                return _fmt_retval(self.match_type_dict['substitution'])
510 1
            elif (
511
                ln_equal
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
512
                and levenshtein(src_fn, tar_fn, cost=(99, 99, 1, 99)) == 1
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
513
            ):
514 1
                return _fmt_retval(self.match_type_dict['substitution'])
515 1 View Code Duplication
        if tests & self.test_dict['transposition']:
0 ignored issues
show
Duplication introduced by
This code seems to be duplicated in your project.
Loading history...
516 1
            if fn_equal and (
517
                levenshtein(src_ln, tar_ln, mode='osa', cost=(99, 99, 99, 1))
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
518
                == 1
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
519
            ):
520 1
                return _fmt_retval(self.match_type_dict['transposition'])
521 1
            elif ln_equal and (
522
                levenshtein(src_fn, tar_fn, mode='osa', cost=(99, 99, 99, 1))
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
523
                == 1
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
524
            ):
525 1
                return _fmt_retval(self.match_type_dict['transposition'])
526 1
        if tests & self.test_dict['punctuation']:
527 1
            np_src_fn = self._synoname_strip_punct(src_fn)
528 1
            np_tar_fn = self._synoname_strip_punct(tar_fn)
529 1
            np_src_ln = self._synoname_strip_punct(src_ln)
530 1
            np_tar_ln = self._synoname_strip_punct(tar_ln)
531
532 1
            if (np_src_fn == np_tar_fn) and (np_src_ln == np_tar_ln):
533 1
                return _fmt_retval(self.match_type_dict['punctuation'])
534
535 1
            np_src_fn = self._synoname_strip_punct(src_fn.replace('-', ' '))
536 1
            np_tar_fn = self._synoname_strip_punct(tar_fn.replace('-', ' '))
537 1
            np_src_ln = self._synoname_strip_punct(src_ln.replace('-', ' '))
538 1
            np_tar_ln = self._synoname_strip_punct(tar_ln.replace('-', ' '))
539
540 1
            if (np_src_fn == np_tar_fn) and (np_src_ln == np_tar_ln):
541 1
                return _fmt_retval(self.match_type_dict['punctuation'])
542
543 1
        if tests & self.test_dict['initials'] and ln_equal:
544 1
            if src_fn and tar_fn:
545 1
                src_initials = self._synoname_strip_punct(src_fn).split()
546 1
                tar_initials = self._synoname_strip_punct(tar_fn).split()
547 1
                initials = bool(
548
                    (len(src_initials) == len(''.join(src_initials)))
549
                    or (len(tar_initials) == len(''.join(tar_initials)))
550
                )
551 1
                if initials:
552 1
                    src_initials = ''.join(_[0] for _ in src_initials)
553 1
                    tar_initials = ''.join(_[0] for _ in tar_initials)
554 1
                    if src_initials == tar_initials:
555 1
                        return _fmt_retval(self.match_type_dict['initials'])
556 1
                    initial_diff = abs(len(src_initials) - len(tar_initials))
557 1
                    if initial_diff and (
558
                        (
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
559
                            initial_diff
560
                            == levenshtein(
561
                                src_initials,
562
                                tar_initials,
563
                                cost=(1, 99, 99, 99),
564
                            )
565
                        )
566
                        or (
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
567
                            initial_diff
568
                            == levenshtein(
569
                                tar_initials,
570
                                src_initials,
571
                                cost=(1, 99, 99, 99),
572
                            )
573
                        )
574
                    ):
575 1
                        return _fmt_retval(self.match_type_dict['initials'])
576 1
        if tests & self.test_dict['extension']:
577 1
            if src_ln[1] == tar_ln[1] and (
578
                src_ln.startswith(tar_ln) or tar_ln.startswith(src_ln)
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
579
            ):
580 1
                if (
0 ignored issues
show
best-practice introduced by
Too many boolean expressions in if statement (7/5)
Loading history...
581
                    (not src_len_fn and not tar_len_fn)
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
582
                    or (tar_fn and src_fn.startswith(tar_fn))
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
583
                    or (src_fn and tar_fn.startswith(src_fn))
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
584
                ) and not roman_conflict:
585 1
                    return _fmt_retval(self.match_type_dict['extension'])
586 1
        if tests & self.test_dict['inclusion'] and ln_equal:
587 1
            if (src_fn and src_fn in tar_fn) or (tar_fn and tar_fn in src_ln):
588 1
                return _fmt_retval(self.match_type_dict['inclusion'])
589 1
        if tests & self.test_dict['no_first'] and ln_equal:
590 1
            if src_fn == '' or tar_fn == '':
591 1
                return _fmt_retval(self.match_type_dict['no_first'])
592 1
        if tests & self.test_dict['word_approx']:
593 1
            ratio = self._synoname_word_approximation(
594
                src_ln,
595
                tar_ln,
596
                src_fn,
597
                tar_fn,
598
                {
599
                    'gen_conflict': gen_conflict,
600
                    'roman_conflict': roman_conflict,
601
                    'src_specials': src_specials,
602
                    'tar_specials': tar_specials,
603
                },
604
            )
605 1
            if ratio == 1 and tests & self.test_dict['confusions']:
606 1
                if (
607
                    ' '.join((src_fn, src_ln)).strip()
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
608
                    == ' '.join((tar_fn, tar_ln)).strip()
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
609
                ):
610 1
                    return _fmt_retval(self.match_type_dict['confusions'])
611 1
            if ratio >= word_approx_min:
612 1
                return _fmt_retval(self.match_type_dict['word_approx'])
613 1
        if tests & self.test_dict['char_approx']:
614 1
            if ca_ratio >= char_approx_min:
615 1
                return _fmt_retval(self.match_type_dict['char_approx'])
616 1
        return _fmt_retval(self.match_type_dict['no_match'])
617
618 1
    def dist(
        self,
        src,
        tar,
        word_approx_min=0.3,
        char_approx_min=0.73,
        tests=2 ** 12 - 1,
    ):
        """Return the normalized Synoname distance between two words.

        The numeric match type reported by :py:meth:`dist_abs` is divided by
        14 to yield a float.

        Args:
            src (str or tuple): Source name for comparison; forwarded
                unchanged to :py:meth:`dist_abs`
            tar (str or tuple): Target name for comparison; forwarded
                unchanged to :py:meth:`dist_abs`
            word_approx_min (float): the minimum word approximation value to
                signal a 'word_approx' match
            char_approx_min (float): the minimum character approximation value
                to signal a 'char_approx' match
            tests (int or Iterable): either an integer indicating tests to
                perform or a list of test names to perform (defaults to
                performing all tests)

        Returns:
            float: Normalized Synoname distance

        """
        # Delegate to this instance's dist_abs instead of the module-level
        # synoname() wrapper: the wrapper builds a throwaway Synoname() on
        # every call and would bypass a subclass override of dist_abs.
        # ret_name is pinned to False so the result is numeric and divisible.
        return (
            self.dist_abs(
                src, tar, word_approx_min, char_approx_min, tests, False
            )
            / 14
        )
647
648
649 1
def synoname(
    src,
    tar,
    word_approx_min=0.3,
    char_approx_min=0.73,
    tests=2 ** 12 - 1,
    ret_name=False,
):
    """Return the Synoname similarity type of two words.

    Convenience wrapper: a fresh :py:class:`Synoname` instance is created
    and every argument is handed, unchanged and in order, to
    :py:meth:`Synoname.dist_abs`.

    Args:
        src (str or tuple): Source name for comparison (the examples below
            pass 3-tuples)
        tar (str or tuple): Target name for comparison (the examples below
            pass 3-tuples)
        word_approx_min (float): the minimum word approximation value to signal
            a 'word_approx' match
        char_approx_min (float): the minimum character approximation value to
            signal a 'char_approx' match
        tests (int or Iterable): either an integer indicating tests to perform
            or a list of test names to perform (defaults to performing all
            tests)
        ret_name (bool): If True, returns the match name rather than its
            integer equivalent

    Returns:
        int (or str if ret_name is True): Synoname value

    Examples:
        >>> synoname(('Breghel', 'Pieter', ''), ('Brueghel', 'Pieter', ''))
        2
        >>> synoname(('Breghel', 'Pieter', ''), ('Brueghel', 'Pieter', ''),
        ... ret_name=True)
        'omission'
        >>> synoname(('Dore', 'Gustave', ''),
        ... ('Dore', 'Paul Gustave Louis Christophe', ''),
        ... ret_name=True)
        'inclusion'
        >>> synoname(('Pereira', 'I. R.', ''), ('Pereira', 'I. Smith', ''),
        ... ret_name=True)
        'word_approx'

    """
    matcher = Synoname()
    return matcher.dist_abs(
        src, tar, word_approx_min, char_approx_min, tests, ret_name
    )
695
696
697
if __name__ == '__main__':
    # Executed as a script: run every doctest embedded in this module's
    # docstrings.
    from doctest import testmod

    testmod()
701