Completed
Push — master ( f43547...71985b )
by Chris
12:00 queued 10s
created

abydos.distance._synoname.Synoname.dist()   A

Complexity

Conditions 1

Size

Total Lines 35
Code Lines 10

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 2
CRAP Score 1

Importance

Changes 0
Metric Value
cc 1
eloc 10
nop 6
dl 0
loc 35
ccs 2
cts 2
cp 1
crap 1
rs 9.9
c 0
b 0
f 0
1
# -*- coding: utf-8 -*-
2
3
# Copyright 2018 by Christopher C. Little.
4
# This file is part of Abydos.
5
#
6
# Abydos is free software: you can redistribute it and/or modify
7
# it under the terms of the GNU General Public License as published by
8
# the Free Software Foundation, either version 3 of the License, or
9
# (at your option) any later version.
10
#
11
# Abydos is distributed in the hope that it will be useful,
12
# but WITHOUT ANY WARRANTY; without even the implied warranty of
13
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14
# GNU General Public License for more details.
15
#
16
# You should have received a copy of the GNU General Public License
17
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.
18
19 1
"""abydos.distance._synoname.
20
21
Synoname.
22
"""
23
24 1
from __future__ import (
25
    absolute_import,
26
    division,
27
    print_function,
28
    unicode_literals,
29
)
30
31 1
from collections import Iterable
32
33 1
from ._distance import _Distance
34 1
from ._levenshtein import levenshtein
35 1
from ._ratcliff_obershelp import sim_ratcliff_obershelp
36
37
# noinspection PyProtectedMember
38 1
from ..fingerprint._synoname import SynonameToolcode
39
40 1
__all__ = ['Synoname', 'synoname']
41
42
43 1
class Synoname(_Distance):
0 ignored issues
show
Unused Code introduced by
The variable __class__ seems to be unused.
Loading history...
44
    """Synoname.
45
46
    Cf. :cite:`Getty:1991,Gross:1991`
47
    """
48
49 1
    # Toolcode generator shared by all instances; its special-word table is
    # read by _synoname_word_approximation.
    _stc = SynonameToolcode()

    # Names of the match types, indexed by match type number: index 0 is the
    # empty name and the final entry is 'no_match'.
    _match_name = (
        '',
        'exact',
        'omission',
        'substitution',
        'transposition',
        'punctuation',
        'initials',
        'extension',
        'inclusion',
        'no_first',
        'word_approx',
        'confusions',
        'char_approx',
        'no_match',
    )

    # Reverse lookup: match type name -> match type number.
    _match_type_dict = dict(zip(_match_name, range(len(_match_name))))

    # Bit flag for each individual test: 'exact' -> 1, 'omission' -> 2, ...,
    # 'char_approx' -> 2048. The test names are exactly the match names
    # without the leading '' and the trailing 'no_match'.
    _test_dict = {
        name: 1 << bit for bit, name in enumerate(_match_name[1:-1])
    }
87
88 1
    def _synoname_strip_punct(self, word):
0 ignored issues
show
Coding Style introduced by
This method could be written as a function/class method.

If a method does not access any attributes of the class, it could also be implemented as a function or static method. This can help improve readability. For example

class Foo:
    def some_method(self, x, y):
        return x + y;

could be written as

class Foo:
    @classmethod
    def some_method(cls, x, y):
        return x + y;
Loading history...
89
        """Return a word with punctuation stripped out.
90
91
        Parameters
92
        ----------
93
        word : str
94
            A word to strip punctuation from
95
96
        Returns
97
        -------
98
        str
99
            The word stripped of punctuation
100
101
        Examples
102
        --------
103
        >>> pe = Synoname()
104
        >>> pe._synoname_strip_punct('AB;CD EF-GH$IJ')
105
        'ABCD EFGHIJ'
106
107
        """
108 1
        stripped = ''
109 1
        for char in word:
110 1
            if char not in set(',-./:;"&\'()!{|}?$%*+<=>[\\]^_`~'):
111 1
                stripped += char
112 1
        return stripped.strip()
113
114 1
    def _synoname_word_approximation(
        self, src_ln, tar_ln, src_fn='', tar_fn='', features=None
    ):
        """Return the Synoname word approximation score for two names.

        The numbered comments in the body (# 1 ... # 10) are the step labels
        used by the original implementation and are kept for reference.

        Parameters
        ----------
        src_ln : str
            Last name of the source
        tar_ln : str
            Last name of the target
        src_fn : str
            First name of the source (optional)
        tar_fn : str
            First name of the target (optional)
        features : dict
            A dict containing special features calculated using
            :py:class:`fingerprint.SynonameToolcode` (optional). The keys
            read here are ``gen_conflict``, ``roman_conflict``,
            ``src_specials`` and ``tar_specials``; the two ``*_specials``
            keys are added to the dict (as empty lists) if they are missing.

        Returns
        -------
        float or int
            The word approximation score: ``0`` or ``1`` for the hard
            no-match/match cases, otherwise the ratio of matched words to
            the larger of the two word counts

        Examples
        --------
        >>> pe = Synoname()
        >>> pe._synoname_word_approximation('Smith Waterman', 'Waterman',
        ... 'Tom Joe Bob', 'Tom Joe')
        0.6

        """
        if features is None:
            features = {}
        # setdefault() keeps the original side effect of storing the empty
        # lists in a caller-supplied dict.
        src_specials = features.setdefault('src_specials', [])
        tar_specials = features.setdefault('tar_specials', [])

        src_len_specials = len(src_specials)
        tar_len_specials = len(tar_specials)
        has_specials = tar_len_specials > 0 or src_len_specials > 0

        # 1
        if features.get('gen_conflict') or features.get('roman_conflict'):
            return 0

        # 3 & 7
        # The "1" variants have the 'a'/'b'/'c' specials removed; the "2"
        # variants additionally have the 'd'/'X' specials removed.
        full_tar1 = self._synoname_strip_abc_specials(
            ' '.join((tar_ln, tar_fn)).replace('-', ' ').strip(),
            tar_specials,
        )
        full_src1 = self._synoname_strip_abc_specials(
            ' '.join((src_ln, src_fn)).replace('-', ' ').strip(),
            src_specials,
        )
        full_tar2 = self._synoname_strip_dx_specials(full_tar1, tar_specials)
        full_src2 = self._synoname_strip_dx_specials(full_src1, src_specials)

        tar1_words = self._synoname_strip_punct(full_tar1).split()
        src1_words = self._synoname_strip_punct(full_src1).split()
        tar2_words = self._synoname_strip_punct(full_tar2).split()
        src2_words = self._synoname_strip_punct(full_src2).split()

        tar1_num_words = len(tar1_words)
        src1_num_words = len(src1_words)
        tar2_num_words = len(tar2_words)
        src2_num_words = len(src2_words)

        # 2
        if (
            src1_num_words < 2
            and src_len_specials == 0
            and src2_num_words < 2
            and tar_len_specials == 0
        ):
            return 0

        # 4
        if (
            tar1_num_words == 1
            and src1_num_words == 1
            and tar1_words[0] == src1_words[0]
        ):
            return 1
        if tar1_num_words < 2 and tar_len_specials == 0:
            return 0

        # 5 & 6
        # Words are only counted when a last name was found. The ratio is
        # computed only when it is returned, at which point at least one word
        # matched and the divisor is therefore non-zero; the previous
        # unconditional division raised ZeroDivisionError when both word
        # lists were empty.
        if self._synoname_last_name_found(
            src_ln, tar_ln, src1_words, tar1_words
        ):
            matches = self._synoname_count_matches(src1_words, tar1_words)
            if matches > 1 or (
                matches == 1
                and src1_num_words == 1
                and tar1_num_words == 1
                and has_specials
            ):
                return matches / max(tar1_num_words, src1_num_words)

        # 8
        if (
            tar2_num_words == 1
            and src2_num_words == 1
            and tar2_words[0] == src2_words[0]
        ):
            return 1
        # I see no way that the following can be True if the equivalent in
        # #4 was False.
        if tar2_num_words < 2 and tar_len_specials == 0:  # pragma: no cover
            return 0

        # 9
        if not self._synoname_last_name_found(
            src_ln, tar_ln, src2_words, tar2_words
        ):
            return 0

        # 10
        matches = self._synoname_count_matches(src2_words, tar2_words)
        if matches > 1 or (
            matches == 1
            and src2_num_words == 1
            and tar2_num_words == 1
            and has_specials
        ):
            return matches / max(tar2_num_words, src2_num_words)

        return 0

    def _synoname_strip_abc_specials(self, full_name, specials):
        """Return full_name with its type 'a', 'b' and 'c' specials removed.

        Parameters
        ----------
        full_name : str
            The joined last and first name
        specials : list
            (position, type) pairs; position indexes the toolcode special
            table, whose entry [1] is the special's text

        Returns
        -------
        str
            The name with the specials cut out; pairs of any other type are
            ignored

        """
        for s_pos, s_type in specials:
            if s_type not in ('a', 'b', 'c'):
                continue
            special = self._stc._synoname_special_table[  # noqa: SF01
                s_pos
            ][1]
            if s_type == 'a':
                # Drop the special plus one more character from the end.
                full_name = full_name[: -(1 + len(special))]
            elif s_type == 'b':
                # Cut the space-delimited special out of the middle.
                # NOTE(review): if the special is not present, find() gives
                # -1, loc becomes 0 and the leading characters are removed
                # instead -- behaviour kept as-is, confirm intent.
                loc = full_name.find(' ' + special + ' ') + 1
                full_name = full_name[:loc] + full_name[loc + len(special) :]
            else:
                # Type 'c': drop the special plus one more character from
                # the start.
                full_name = full_name[1 + len(special) :]
        return full_name

    def _synoname_strip_dx_specials(self, full_name, specials):
        """Return full_name with its type 'd' and 'X' specials removed.

        Parameters
        ----------
        full_name : str
            A name already processed by _synoname_strip_abc_specials
        specials : list
            (position, type) pairs; position indexes the toolcode special
            table, whose entry [1] is the special's text

        Returns
        -------
        str
            The name with the specials cut out; pairs of any other type are
            ignored

        """
        for s_pos, s_type in specials:
            if s_type not in ('d', 'X'):
                continue
            special = self._stc._synoname_special_table[  # noqa: SF01
                s_pos
            ][1]
            if s_type == 'd':
                # Drop the special from the start.
                full_name = full_name[len(special) :]
            elif special in full_name:
                # NOTE(review): the membership test above is for the bare
                # special but the search below is for ' ' + special, so
                # find() can still give -1 (e.g. special at the very start)
                # -- behaviour kept as-is, confirm intent.
                loc = full_name.find(' ' + special)
                full_name = full_name[:loc] + full_name[loc + len(special) :]
        return full_name

    @staticmethod
    def _synoname_last_name_found(src_ln, tar_ln, src_words, tar_words):
        """Return whether a word of one name occurs in the other's last name.

        A word counts as found when the other side's last name ends with it
        or contains it followed by a space. Target words are checked against
        src_ln first, then source words against tar_ln.
        """
        return any(
            src_ln.endswith(word) or word + ' ' in src_ln
            for word in tar_words
        ) or any(
            tar_ln.endswith(word) or word + ' ' in tar_ln
            for word in src_words
        )

    @staticmethod
    def _synoname_count_matches(src_words, tar_words):
        """Return the number of equal (source word, target word) pairs.

        Both lists are modified in place: each matched entry is overwritten
        with '@'. The source word being compared keeps its original value
        for the rest of its inner loop, exactly as in the original nested
        loops.
        """
        matches = 0
        for i, s_word in enumerate(src_words):
            for j, t_word in enumerate(tar_words):
                if s_word == t_word:
                    src_words[i] = '@'
                    tar_words[j] = '@'
                    matches += 1
        return matches
424
425 1
    def dist_abs(
        self,
        src,
        tar,
        word_approx_min=0.3,
        char_approx_min=0.73,
        tests=2 ** 12 - 1,
        ret_name=False,
    ):
        """Return the Synoname similarity type of two words.

        The tests are tried in a fixed order (exact, omission, substitution,
        transposition, punctuation, initials, extension, inclusion, no_first,
        word_approx/confusions, char_approx) and the first one that matches
        determines the return value.

        Parameters
        ----------
        src : str or tuple
            Source name for comparison: either a ``(last name, first name,
            qualifier)`` tuple, a ``#``-delimited string whose last three
            fields are those parts, or a plain string, which is treated as a
            last name with empty first name and qualifier
        tar : str or tuple
            Target name for comparison, in the same formats as ``src``
        word_approx_min : float
            The minimum word approximation value to signal a 'word_approx'
            match
        char_approx_min : float
            The minimum character approximation value to signal a 'char_approx'
            match
        tests : int or Iterable
            Either an integer indicating tests to perform or a list of test
            names to perform (defaults to performing all tests); names that
            are not known tests are ignored
        ret_name : bool
            If True, returns the match name rather than its integer equivalent

        Returns
        -------
        int (or str if ret_name is True)
            Synoname value

        Examples
        --------
        >>> cmp = Synoname()
        >>> cmp.dist_abs(('Breghel', 'Pieter', ''), ('Brueghel', 'Pieter', ''))
        2
        >>> cmp.dist_abs(('Breghel', 'Pieter', ''), ('Brueghel', 'Pieter', ''),
        ... ret_name=True)
        'omission'
        >>> cmp.dist_abs(('Dore', 'Gustave', ''),
        ... ('Dore', 'Paul Gustave Louis Christophe', ''), ret_name=True)
        'inclusion'
        >>> cmp.dist_abs(('Pereira', 'I. R.', ''), ('Pereira', 'I. Smith', ''),
        ... ret_name=True)
        'word_approx'

        """
        if isinstance(tests, Iterable):
            # Fold test names into a bitmask. OR-ing (rather than adding)
            # keeps a repeated name from carrying into another test's bit.
            new_tests = 0
            for term in tests:
                if term in self._test_dict:
                    new_tests |= self._test_dict[term]
            tests = new_tests

        def _split_name(name):
            """Return stripped, lowercased (last, first, qualifier) parts."""
            if isinstance(name, tuple):
                ln, fn, qual = name
            elif '#' in name:
                ln, fn, qual = name.split('#')[-3:]
            else:
                ln, fn, qual = name, '', ''
            return (
                ln.strip().lower(),
                fn.strip().lower(),
                qual.strip().lower(),
            )

        def _split_special(spec):
            """Split a specials field into (int, str) pairs, 4 chars each."""
            spec_list = []
            while spec:
                spec_list.append((int(spec[:3]), spec[3:4]))
                spec = spec[4:]
            return spec_list

        def _parse_toolcode(toolcode):
            """Return (generation, romancode, len_fn, specials) of a toolcode.

            The fixed offsets below are taken as-is from the toolcode string
            produced by the fingerprinter; the specials are read from its
            second ``$``-delimited field.
            """
            return (
                int(toolcode[2]),
                int(toolcode[3:6]),
                int(toolcode[6:8]),
                _split_special(toolcode.split('$')[1]),
            )

        def _fmt_retval(val):
            """Map the integer match type to its name when requested."""
            if ret_name:
                return self._match_name[val]
            return val

        # 1. Preprocessing: split the names into parts & lowercase them
        src_ln, src_fn, src_qual = _split_name(src)
        tar_ln, tar_fn, tar_qual = _split_name(tar)

        # Create toolcodes
        src_ln, src_fn, src_tc = self._stc.fingerprint(
            src_ln, src_fn, src_qual
        )
        tar_ln, tar_fn, tar_tc = self._stc.fingerprint(
            tar_ln, tar_fn, tar_qual
        )

        (
            src_generation,
            src_romancode,
            src_len_fn,
            src_specials,
        ) = _parse_toolcode(src_tc)
        (
            tar_generation,
            tar_romancode,
            tar_len_fn,
            tar_specials,
        ) = _parse_toolcode(tar_tc)

        # A conflict means the two codes differ and at least one is non-zero.
        gen_conflict = (src_generation != tar_generation) and bool(
            src_generation or tar_generation
        )
        roman_conflict = (src_romancode != tar_romancode) and bool(
            src_romancode or tar_romancode
        )

        ln_equal = src_ln == tar_ln
        fn_equal = src_fn == tar_fn

        def _strip_master(full_name):
            """Drop a leading 'master ' and any introductory words after it."""
            if full_name.startswith('master '):
                full_name = full_name[len('master '):]
                # No break: each intro is tried in turn against whatever
                # remains, so more than one may be removed.
                for intro in (
                    'of the ',
                    'of ',
                    'known as the ',
                    'with the ',
                    'with ',
                ):
                    if full_name.startswith(intro):
                        full_name = full_name[len(intro):]
            return full_name

        def _char_approx_ratio():
            """Return the character-approximation ratio (0 on a conflict)."""
            if gen_conflict or roman_conflict:
                return 0
            return sim_ratcliff_obershelp(
                _strip_master(' '.join((src_ln, src_fn))),
                _strip_master(' '.join((tar_ln, tar_fn))),
            )

        if tests & self._test_dict['exact'] and fn_equal and ln_equal:
            return _fmt_retval(self._match_type_dict['exact'])
        if tests & self._test_dict['omission']:
            # Only insertions/deletions are cheap here (cost 1).
            if (
                fn_equal
                and levenshtein(src_ln, tar_ln, cost=(1, 1, 99, 99)) == 1
            ):
                if not roman_conflict:
                    return _fmt_retval(self._match_type_dict['omission'])
            elif (
                ln_equal
                and levenshtein(src_fn, tar_fn, cost=(1, 1, 99, 99)) == 1
            ):
                return _fmt_retval(self._match_type_dict['omission'])
        if tests & self._test_dict['substitution']:
            # Only substitutions are cheap here (cost 1).
            if (
                fn_equal
                and levenshtein(src_ln, tar_ln, cost=(99, 99, 1, 99)) == 1
            ):
                return _fmt_retval(self._match_type_dict['substitution'])
            elif (
                ln_equal
                and levenshtein(src_fn, tar_fn, cost=(99, 99, 1, 99)) == 1
            ):
                return _fmt_retval(self._match_type_dict['substitution'])
        if tests & self._test_dict['transposition']:
            # Only transpositions are cheap here (cost 1, OSA mode).
            if fn_equal and (
                levenshtein(src_ln, tar_ln, mode='osa', cost=(99, 99, 99, 1))
                == 1
            ):
                return _fmt_retval(self._match_type_dict['transposition'])
            elif ln_equal and (
                levenshtein(src_fn, tar_fn, mode='osa', cost=(99, 99, 99, 1))
                == 1
            ):
                return _fmt_retval(self._match_type_dict['transposition'])
        if tests & self._test_dict['punctuation']:
            np_src_fn = self._synoname_strip_punct(src_fn)
            np_tar_fn = self._synoname_strip_punct(tar_fn)
            np_src_ln = self._synoname_strip_punct(src_ln)
            np_tar_ln = self._synoname_strip_punct(tar_ln)

            if (np_src_fn == np_tar_fn) and (np_src_ln == np_tar_ln):
                return _fmt_retval(self._match_type_dict['punctuation'])

            # Second pass: treat hyphens as word separators before stripping.
            np_src_fn = self._synoname_strip_punct(src_fn.replace('-', ' '))
            np_tar_fn = self._synoname_strip_punct(tar_fn.replace('-', ' '))
            np_src_ln = self._synoname_strip_punct(src_ln.replace('-', ' '))
            np_tar_ln = self._synoname_strip_punct(tar_ln.replace('-', ' '))

            if (np_src_fn == np_tar_fn) and (np_src_ln == np_tar_ln):
                return _fmt_retval(self._match_type_dict['punctuation'])

        if tests & self._test_dict['initials'] and ln_equal:
            if src_fn and tar_fn:
                src_initials = self._synoname_strip_punct(src_fn).split()
                tar_initials = self._synoname_strip_punct(tar_fn).split()
                # At least one first name consists solely of 1-letter words.
                initials = bool(
                    (len(src_initials) == len(''.join(src_initials)))
                    or (len(tar_initials) == len(''.join(tar_initials)))
                )
                if initials:
                    src_initials = ''.join(word[0] for word in src_initials)
                    tar_initials = ''.join(word[0] for word in tar_initials)
                    if src_initials == tar_initials:
                        return _fmt_retval(self._match_type_dict['initials'])
                    initial_diff = abs(len(src_initials) - len(tar_initials))
                    # The length difference must be explained by edits of
                    # the single cheap kind alone, in either direction.
                    if initial_diff and (
                        (
                            initial_diff
                            == levenshtein(
                                src_initials,
                                tar_initials,
                                cost=(1, 99, 99, 99),
                            )
                        )
                        or (
                            initial_diff
                            == levenshtein(
                                tar_initials,
                                src_initials,
                                cost=(1, 99, 99, 99),
                            )
                        )
                    ):
                        return _fmt_retval(self._match_type_dict['initials'])
        if tests & self._test_dict['extension']:
            # Slices instead of [1] so that last names shorter than two
            # characters do not raise IndexError.
            if src_ln[1:2] == tar_ln[1:2] and (
                src_ln.startswith(tar_ln) or tar_ln.startswith(src_ln)
            ):
                if (
                    (not src_len_fn and not tar_len_fn)
                    or (tar_fn and src_fn.startswith(tar_fn))
                    or (src_fn and tar_fn.startswith(src_fn))
                ) and not roman_conflict:
                    return _fmt_retval(self._match_type_dict['extension'])
        if tests & self._test_dict['inclusion'] and ln_equal:
            # Symmetric check: either first name contained in the other.
            if (src_fn and src_fn in tar_fn) or (tar_fn and tar_fn in src_fn):
                return _fmt_retval(self._match_type_dict['inclusion'])
        if tests & self._test_dict['no_first'] and ln_equal:
            if src_fn == '' or tar_fn == '':
                return _fmt_retval(self._match_type_dict['no_first'])
        if tests & self._test_dict['word_approx']:
            ratio = self._synoname_word_approximation(
                src_ln,
                tar_ln,
                src_fn,
                tar_fn,
                {
                    'gen_conflict': gen_conflict,
                    'roman_conflict': roman_conflict,
                    'src_specials': src_specials,
                    'tar_specials': tar_specials,
                },
            )
            if ratio == 1 and tests & self._test_dict['confusions']:
                if (
                    ' '.join((src_fn, src_ln)).strip()
                    == ' '.join((tar_fn, tar_ln)).strip()
                ):
                    return _fmt_retval(self._match_type_dict['confusions'])
            if ratio >= word_approx_min:
                return _fmt_retval(self._match_type_dict['word_approx'])
        if tests & self._test_dict['char_approx']:
            # Computed only here: no earlier test needs the ratio.
            if _char_approx_ratio() >= char_approx_min:
                return _fmt_retval(self._match_type_dict['char_approx'])
        return _fmt_retval(self._match_type_dict['no_match'])
712
713 1
    def dist(
        self,
        src,
        tar,
        word_approx_min=0.3,
        char_approx_min=0.73,
        tests=2 ** 12 - 1,
    ):
        """Return the normalized Synoname distance between two words.

        The value is the integer match type returned by :py:meth:`dist_abs`
        (with ``ret_name=False``) divided by 14.

        Parameters
        ----------
        src : str
            Source string for comparison
        tar : str
            Target string for comparison
        word_approx_min : float
            The minimum word approximation value to signal a 'word_approx'
            match
        char_approx_min : float
            The minimum character approximation value to signal a 'char_approx'
            match
        tests : int or Iterable
            Either an integer indicating tests to perform or a list of test
            names to perform (defaults to performing all tests)

        Returns
        -------
        float
            Normalized Synoname distance

        """
        # Call the method on this instance instead of going through the
        # module-level wrapper, which would build a throwaway Synoname and
        # ignore any subclass override of dist_abs.
        match_type = self.dist_abs(
            src,
            tar,
            word_approx_min=word_approx_min,
            char_approx_min=char_approx_min,
            tests=tests,
            ret_name=False,
        )
        return match_type / 14
749
750
751 1
def synoname(
    src,
    tar,
    word_approx_min=0.3,
    char_approx_min=0.73,
    tests=2 ** 12 - 1,
    ret_name=False,
):
    """Return the Synoname similarity type of two words.

    Convenience function: builds a fresh :py:class:`Synoname` object and
    forwards every argument unchanged to :py:meth:`Synoname.dist_abs`.

    Parameters
    ----------
    src : str
        Source string for comparison
    tar : str
        Target string for comparison
    word_approx_min : float
        Threshold at or above which a word approximation counts as a
        'word_approx' match
    char_approx_min : float
        Threshold at or above which a character approximation counts as a
        'char_approx' match
    tests : int or Iterable
        The tests to run, given either as an integer or as a list of test
        names (all tests are run by default)
    ret_name : bool
        When True, the name of the match type is returned in place of its
        integer code

    Returns
    -------
    int (or str if ret_name is True)
        Synoname value

    Examples
    --------
    >>> synoname(('Breghel', 'Pieter', ''), ('Brueghel', 'Pieter', ''))
    2
    >>> synoname(('Breghel', 'Pieter', ''), ('Brueghel', 'Pieter', ''),
    ... ret_name=True)
    'omission'
    >>> synoname(('Dore', 'Gustave', ''),
    ... ('Dore', 'Paul Gustave Louis Christophe', ''), ret_name=True)
    'inclusion'
    >>> synoname(('Pereira', 'I. R.', ''), ('Pereira', 'I. Smith', ''),
    ... ret_name=True)
    'word_approx'

    """
    comparator = Synoname()
    return comparator.dist_abs(
        src,
        tar,
        word_approx_min=word_approx_min,
        char_approx_min=char_approx_min,
        tests=tests,
        ret_name=ret_name,
    )
803
804
805
if __name__ == '__main__':
    # Executed as a script: run the doctests embedded in this module.
    from doctest import testmod

    testmod()
809