abydos.distance._synoname.synoname() - Code Metrics - Inspection of "Merge pull request #248 from chrislit/0.6.0" - chrislit/abydos - Measure and Improve Code Quality continuously with Scrutinizer

Passed

Push — master ( c2a3b6...15a61d )

by Chris

created 2020-01-12 22:24 UTC

abydos.distance._synoname.synoname() A

↳ Parent: Project

Complexity

Conditions

Size

Total Lines	60
Code Lines	15

Duplication

Lines	0
Ratio	0 %

Code Coverage

Tests	3
CRAP Score	1

Importance

Changes

Metric	Value
eloc	15
dl	0
loc	60
ccs	3
cts	3
cp	1
rs	9.65
c	0
b	0
f	0
cc	1
nop	6
crap	1

How to fix Long Method

# Copyright 2018-2020 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.

"""abydos.distance._synoname.

Synoname.
"""

try:
    from collections.abc import Iterable
except ImportError:  # pragma: no cover
    from collections import Iterable

from ._distance import _Distance
from ._levenshtein import Levenshtein
from ._ratcliff_obershelp import RatcliffObershelp

# noinspection PyProtectedMember
from ..fingerprint._synoname_toolcode import SynonameToolcode

__all__ = ['Synoname']


class Synoname(_Distance):
    """Synoname.

    Cf. :cite:`Getty:1991,Gross:1991`

    .. versionadded:: 0.3.6
    """

    # Helper objects, created once when the class is defined and therefore
    # shared by every instance.
    _lev = Levenshtein()
    _ratcliff_obershelp = RatcliffObershelp()

    _stc = SynonameToolcode()

    # Names of the match types. A name's position in this tuple is the
    # integer code used for it; position 0 is an empty placeholder and the
    # final entry stands for "no match".
    _match_name = (
        '',
        'exact',
        'omission',
        'substitution',
        'transposition',
        'punctuation',
        'initials',
        'extension',
        'inclusion',
        'no_first',
        'word_approx',
        'confusions',
        'char_approx',
        'no_match',
    )

    # Reverse lookup: match type name -> integer code.
    _match_type_dict = dict(zip(_match_name, range(len(_match_name))))

    # One bit flag per selectable test ('exact' is bit 0, 'char_approx' is
    # bit 11). The placeholder and 'no_match' entries are not tests.
    _test_dict = {
        name: 1 << bit for bit, name in enumerate(_match_name[1:-1])
    }

    def _synoname_strip_punct(self, word):
        """Return a word with punctuation stripped out.

        Parameters
        ----------
        word : str
            A word to strip punctuation from

        Returns
        -------
        str
            The word stripped of punctuation

        Examples
        --------
        >>> pe = Synoname()
        >>> pe._synoname_strip_punct('AB;CD EF-GH$IJ')
        'ABCD EFGHIJ'


        .. versionadded:: 0.3.0
        .. versionchanged:: 0.3.6
            Encapsulated in class

        """
        stripped = ''
        for char in word:
            if char not in set(',-./:;"&\'()!{|}?$%*+<=>[\\]^_`~'):
                stripped += char
        return stripped.strip()

    def _synoname_word_approximation(
        self, src_ln, tar_ln, src_fn='', tar_fn='', features=None
    ):
        """Return the Synoname word approximation score for two names.

        Parameters
        ----------
        src_ln : str
            Last name of the source
        tar_ln : str
            Last name of the target
        src_fn : str
            First name of the source (optional)
        tar_fn : str
            First name of the target (optional)
        features : dict
            A dict containing special features calculated using
            :py:class:`fingerprint.SynonameToolcode` (optional)

        Returns
        -------
        float
            The word approximation score

        Examples
        --------
        >>> pe = Synoname()
        >>> pe._synoname_word_approximation('Smith Waterman', 'Waterman',
        ... 'Tom Joe Bob', 'Tom Joe')
        0.6


        .. versionadded:: 0.3.0
        .. versionchanged:: 0.3.6
            Encapsulated in class

        """
        if features is None:
            features = {}
        if 'src_specials' not in features:
            features['src_specials'] = []
        if 'tar_specials' not in features:
            features['tar_specials'] = []

        src_len_specials = len(features['src_specials'])
        tar_len_specials = len(features['tar_specials'])

        # 1
        if ('gen_conflict' in features and features['gen_conflict']) or (
            'roman_conflict' in features and features['roman_conflict']
        ):
            return 0

        # 3 & 7
        full_tar1 = ' '.join((tar_ln, tar_fn)).replace('-', ' ').strip()
        for s_pos, s_type in features['tar_specials']:
            if s_type == 'a':
                full_tar1 = full_tar1[
                    : -(
                        1
                        + len(
                            self._stc._synoname_special_table[  # noqa: SF01
                                s_pos
                            ][1]
                        )
                    )
                ]
            elif s_type == 'b':
                loc = (
                    full_tar1.find(
                        ' '
                        + self._stc._synoname_special_table[  # noqa: SF01
                            s_pos
                        ][1]
                        + ' '
                    )
                    + 1
                )
                full_tar1 = (
                    full_tar1[:loc]
                    + full_tar1[
                        loc
                        + len(
                            self._stc._synoname_special_table[  # noqa: SF01
                                s_pos
                            ][1]
                        ) :
                    ]
                )
            elif s_type == 'c':
                full_tar1 = full_tar1[
                    1
                    + len(
                        self._stc._synoname_special_table[s_pos][  # noqa: SF01
                            1
                        ]
                    ) :
                ]

        full_src1 = ' '.join((src_ln, src_fn)).replace('-', ' ').strip()
        for s_pos, s_type in features['src_specials']:
            if s_type == 'a':
                full_src1 = full_src1[
                    : -(
                        1
                        + len(
                            self._stc._synoname_special_table[  # noqa: SF01
                                s_pos
                            ][1]
                        )
                    )
                ]
            elif s_type == 'b':
                loc = (
                    full_src1.find(
                        ' '
                        + self._stc._synoname_special_table[  # noqa: SF01
                            s_pos
                        ][1]
                        + ' '
                    )
                    + 1
                )
                full_src1 = (
                    full_src1[:loc]
                    + full_src1[
                        loc
                        + len(
                            self._stc._synoname_special_table[  # noqa: SF01
                                s_pos
                            ][1]
                        ) :
                    ]
                )
            elif s_type == 'c':
                full_src1 = full_src1[
                    1
                    + len(
                        self._stc._synoname_special_table[s_pos][  # noqa: SF01
                            1
                        ]
                    ) :
                ]

        full_tar2 = full_tar1
        for s_pos, s_type in features['tar_specials']:
            if s_type == 'd':
                full_tar2 = full_tar2[
                    len(
                        self._stc._synoname_special_table[s_pos][  # noqa: SF01
                            1
                        ]
                    ) :
                ]
            elif (
                s_type == 'X'
                and self._stc._synoname_special_table[s_pos][1]  # noqa: SF01
                in full_tar2
            ):
                loc = full_tar2.find(
                    ' '
                    + self._stc._synoname_special_table[s_pos][1]  # noqa: SF01
                )
                full_tar2 = (
                    full_tar2[:loc]
                    + full_tar2[
                        loc
                        + len(
                            self._stc._synoname_special_table[  # noqa: SF01
                                s_pos
                            ][1]
                        ) :
                    ]
                )

        full_src2 = full_src1
        for s_pos, s_type in features['src_specials']:
            if s_type == 'd':
                full_src2 = full_src2[
                    len(
                        self._stc._synoname_special_table[s_pos][  # noqa: SF01
                            1
                        ]
                    ) :
                ]
            elif (
                s_type == 'X'
                and self._stc._synoname_special_table[s_pos][1]  # noqa: SF01
                in full_src2
            ):
                loc = full_src2.find(
                    ' '
                    + self._stc._synoname_special_table[s_pos][1]  # noqa: SF01
                )
                full_src2 = (
                    full_src2[:loc]
                    + full_src2[
                        loc
                        + len(
                            self._stc._synoname_special_table[  # noqa: SF01
                                s_pos
                            ][1]
                        ) :
                    ]
                )

        full_tar1 = self._synoname_strip_punct(full_tar1)
        tar1_words = full_tar1.split()
        tar1_num_words = len(tar1_words)

        full_src1 = self._synoname_strip_punct(full_src1)
        src1_words = full_src1.split()
        src1_num_words = len(src1_words)

        full_tar2 = self._synoname_strip_punct(full_tar2)
        tar2_words = full_tar2.split()
        tar2_num_words = len(tar2_words)

        full_src2 = self._synoname_strip_punct(full_src2)
        src2_words = full_src2.split()
        src2_num_words = len(src2_words)

        # 2
        if (
            src1_num_words < 2
            and src_len_specials == 0
            and src2_num_words < 2
            and tar_len_specials == 0
        ):
            return 0

        # 4
        if (
            tar1_num_words == 1
            and src1_num_words == 1
            and tar1_words[0] == src1_words[0]
        ):
            return 1
        if tar1_num_words < 2 and tar_len_specials == 0:
            return 0

        # 5
        last_found = False
        for word in tar1_words:
            if src_ln.endswith(word) or word + ' ' in src_ln:
                last_found = True

        if not last_found:
            for word in src1_words:
                if tar_ln.endswith(word) or word + ' ' in tar_ln:
                    last_found = True

        # 6
        matches = 0
        if last_found:
            for i, s_word in enumerate(src1_words):
                for j, t_word in enumerate(tar1_words):
                    if s_word == t_word:
                        src1_words[i] = '@'
                        tar1_words[j] = '@'
                        matches += 1
        w_ratio = matches / max(tar1_num_words, src1_num_words)
        if matches > 1 or (
            matches == 1
            and src1_num_words == 1
            and tar1_num_words == 1
            and (tar_len_specials > 0 or src_len_specials > 0)
        ):
            return w_ratio

        # 8
        if (
            tar2_num_words == 1
            and src2_num_words == 1
            and tar2_words[0] == src2_words[0]
        ):
            return 1
        # I see no way that the following can be True if the equivalent in
        # #4 was False.
        if tar2_num_words < 2 and tar_len_specials == 0:  # pragma: no cover
            return 0

        # 9
        last_found = False
        for word in tar2_words:
            if src_ln.endswith(word) or word + ' ' in src_ln:
                last_found = True

        if not last_found:
            for word in src2_words:
                if tar_ln.endswith(word) or word + ' ' in tar_ln:
                    last_found = True

        if not last_found:
            return 0

        # 10
        matches = 0
        if last_found:
            for i, s_word in enumerate(src2_words):
                for j, t_word in enumerate(tar2_words):
                    if s_word == t_word:
                        src2_words[i] = '@'
                        tar2_words[j] = '@'
                        matches += 1
        w_ratio = matches / max(tar2_num_words, src2_num_words)
        if matches > 1 or (
            matches == 1
            and src2_num_words == 1
            and tar2_num_words == 1
            and (tar_len_specials > 0 or src_len_specials > 0)
        ):
            return w_ratio

        return 0

    def __init__(
        self,
        word_approx_min=0.3,
        char_approx_min=0.73,
        tests=2 ** 12 - 1,
        ret_name=False,
        **kwargs
    ):
        """Initialize Synoname instance.

        Parameters
        ----------
        word_approx_min : float
            The minimum word approximation value to signal a 'word_approx'
            match
        char_approx_min : float
            The minimum character approximation value to signal a 'char_approx'
            match
        tests : int or Iterable
            Either an integer indicating tests to perform or a list of test
            names to perform (defaults to performing all tests)
        ret_name : bool
            If True, returns the match name rather than its integer equivalent
        **kwargs
            Arbitrary keyword arguments


        .. versionadded:: 0.4.0

        """
        super(Synoname, self).__init__(**kwargs)
        self._word_approx_min = word_approx_min
        self._char_approx_min = char_approx_min
        self._ret_name = ret_name

        self._tests = tests
        if isinstance(self._tests, Iterable):
            new_tests = 0
            for term in self._tests:
                if term in self._test_dict:
                    new_tests += self._test_dict[term]
            self._tests = new_tests

    def dist_abs(self, src, tar, force_numeric=False):
        """Return the Synoname similarity type of two words.

        Parameters
        ----------
        src : str
            Source string for comparison
        tar : str
            Target string for comparison
        force_numeric : bool
            Overrides the instance's ret_name setting

        Returns
        -------
        int (or str if ret_name is True)
            Synoname value

        Examples
        --------
        >>> cmp = Synoname()
        >>> cmp.dist_abs(('Breghel', 'Pieter', ''), ('Brueghel', 'Pieter', ''))
        2

        >>> cmp = Synoname(ret_name=True)
        >>> cmp.dist_abs(('Breghel', 'Pieter', ''), ('Brueghel', 'Pieter', ''))
        'omission'
        >>> cmp.dist_abs(('Dore', 'Gustave', ''),
        ... ('Dore', 'Paul Gustave Louis Christophe', ''))
        'inclusion'
        >>> cmp.dist_abs(('Pereira', 'I. R.', ''), ('Pereira', 'I. Smith', ''))
        'word_approx'


        .. versionadded:: 0.3.0
        .. versionchanged:: 0.3.6
            Encapsulated in class

        """
        if isinstance(src, tuple):
            src_ln, src_fn, src_qual = src
        elif '#' in src:
            src_ln, src_fn, src_qual = src.split('#')[-3:]
        else:
            src_ln, src_fn, src_qual = src, '', ''

        if isinstance(tar, tuple):
            tar_ln, tar_fn, tar_qual = tar
        elif '#' in tar:
            tar_ln, tar_fn, tar_qual = tar.split('#')[-3:]
        else:
            tar_ln, tar_fn, tar_qual = tar, '', ''

        def _split_special(spec):
            spec_list = []
            while spec:
                spec_list.append((int(spec[:3]), spec[3:4]))
                spec = spec[4:]
            return spec_list

        def _fmt_retval(val):
            if self._ret_name and not force_numeric:
                return self._match_name[val]
            return val

        # 1. Preprocessing

        # Lowercasing
        src_fn = src_fn.strip().lower()
        src_ln = src_ln.strip().lower()
        src_qual = src_qual.strip().lower()

        tar_fn = tar_fn.strip().lower()
        tar_ln = tar_ln.strip().lower()
        tar_qual = tar_qual.strip().lower()

        # Create toolcodes
        src_ln, src_fn, src_tc = self._stc.fingerprint(
            src_ln, src_fn, src_qual
        )
        tar_ln, tar_fn, tar_tc = self._stc.fingerprint(
            tar_ln, tar_fn, tar_qual
        )

        src_generation = int(src_tc[2])
        src_romancode = int(src_tc[3:6])
        src_len_fn = int(src_tc[6:8])
        src_tc = src_tc.split('$')
        src_specials = _split_special(src_tc[1])

        tar_generation = int(tar_tc[2])
        tar_romancode = int(tar_tc[3:6])
        tar_len_fn = int(tar_tc[6:8])
        tar_tc = tar_tc.split('$')
        tar_specials = _split_special(tar_tc[1])

        gen_conflict = (src_generation != tar_generation) and bool(
            src_generation or tar_generation
        )
        roman_conflict = (src_romancode != tar_romancode) and bool(
            src_romancode or tar_romancode
        )

        ln_equal = src_ln == tar_ln
        fn_equal = src_fn == tar_fn

        # approx_c
        def _approx_c():
            if gen_conflict or roman_conflict:
                return False, 0

            full_src = ' '.join((src_ln, src_fn))
            if full_src.startswith('master '):
                full_src = full_src[len('master ') :]
                for intro in [
                    'of the ',
                    'of ',
                    'known as the ',
                    'with the ',
                    'with ',
                ]:
                    if full_src.startswith(intro):
                        full_src = full_src[len(intro) :]

            full_tar = ' '.join((tar_ln, tar_fn))
            if full_tar.startswith('master '):
                full_tar = full_tar[len('master ') :]
                for intro in [
                    'of the ',
                    'of ',
                    'known as the ',
                    'with the ',
                    'with ',
                ]:
                    if full_tar.startswith(intro):
                        full_tar = full_tar[len(intro) :]

            loc_ratio = self._ratcliff_obershelp.sim(full_src, full_tar)
            return loc_ratio >= self._char_approx_min, loc_ratio

        approx_c_result, ca_ratio = _approx_c()

        if self._tests & self._test_dict['exact'] and fn_equal and ln_equal:
            return _fmt_retval(self._match_type_dict['exact'])
        if self._tests & self._test_dict['omission']:
            self._lev._cost = (1, 1, 99, 99)  # noqa: SF01
            self._lev._mode = 'lev'  # noqa: SF01
            if fn_equal and self._lev.dist_abs(src_ln, tar_ln) == 1:
                if not roman_conflict:
                    return _fmt_retval(self._match_type_dict['omission'])
            elif ln_equal and self._lev.dist_abs(src_fn, tar_fn) == 1:
                return _fmt_retval(self._match_type_dict['omission'])
        if self._tests & self._test_dict['substitution']:
            self._lev._cost = (99, 99, 1, 99)  # noqa: SF01
            self._lev._mode = 'lev'  # noqa: SF01
            if fn_equal and self._lev.dist_abs(src_ln, tar_ln) == 1:
                return _fmt_retval(self._match_type_dict['substitution'])
            elif ln_equal and self._lev.dist_abs(src_fn, tar_fn) == 1:
                return _fmt_retval(self._match_type_dict['substitution'])
        if self._tests & self._test_dict['transposition']:
            self._lev._cost = (99, 99, 99, 1)  # noqa: SF01
            self._lev._mode = 'osa'  # noqa: SF01
            if fn_equal and (self._lev.dist_abs(src_ln, tar_ln) == 1):
                return _fmt_retval(self._match_type_dict['transposition'])
            elif ln_equal and (self._lev.dist_abs(src_fn, tar_fn) == 1):
                return _fmt_retval(self._match_type_dict['transposition'])
        if self._tests & self._test_dict['punctuation']:
            np_src_fn = self._synoname_strip_punct(src_fn)
            np_tar_fn = self._synoname_strip_punct(tar_fn)
            np_src_ln = self._synoname_strip_punct(src_ln)
            np_tar_ln = self._synoname_strip_punct(tar_ln)

            if (np_src_fn == np_tar_fn) and (np_src_ln == np_tar_ln):
                return _fmt_retval(self._match_type_dict['punctuation'])

            np_src_fn = self._synoname_strip_punct(src_fn.replace('-', ' '))
            np_tar_fn = self._synoname_strip_punct(tar_fn.replace('-', ' '))
            np_src_ln = self._synoname_strip_punct(src_ln.replace('-', ' '))
            np_tar_ln = self._synoname_strip_punct(tar_ln.replace('-', ' '))

            if (np_src_fn == np_tar_fn) and (np_src_ln == np_tar_ln):
                return _fmt_retval(self._match_type_dict['punctuation'])

        if self._tests & self._test_dict['initials'] and ln_equal:
            if src_fn and tar_fn:
                src_initials = self._synoname_strip_punct(src_fn).split()
                tar_initials = self._synoname_strip_punct(tar_fn).split()
                initials = bool(
                    (len(src_initials) == len(''.join(src_initials)))
                    or (len(tar_initials) == len(''.join(tar_initials)))
                )
                if initials:
                    src_initials = ''.join(_[0] for _ in src_initials)
                    tar_initials = ''.join(_[0] for _ in tar_initials)
                    if src_initials == tar_initials:
                        return _fmt_retval(self._match_type_dict['initials'])
                    initial_diff = abs(len(src_initials) - len(tar_initials))
                    self._lev._cost = (1, 99, 99, 99)  # noqa: SF01
                    self._lev._mode = 'lev'  # noqa: SF01
                    if initial_diff and (
                        (
                            initial_diff
                            == self._lev.dist_abs(src_initials, tar_initials,)
                        )
                        or (
                            initial_diff
                            == self._lev.dist_abs(tar_initials, src_initials,)
                        )
                    ):
                        return _fmt_retval(self._match_type_dict['initials'])
        if self._tests & self._test_dict['extension']:
            if src_ln[1:2] == tar_ln[1:2] and (
                src_ln.startswith(tar_ln) or tar_ln.startswith(src_ln)
            ):
                if (
                    (not src_len_fn and not tar_len_fn)
                    or (tar_fn and src_fn.startswith(tar_fn))
                    or (src_fn and tar_fn.startswith(src_fn))
                ) and not roman_conflict:
                    return _fmt_retval(self._match_type_dict['extension'])
        if self._tests & self._test_dict['inclusion'] and ln_equal:
            if (src_fn and src_fn in tar_fn) or (tar_fn and tar_fn in src_ln):
                return _fmt_retval(self._match_type_dict['inclusion'])
        if self._tests & self._test_dict['no_first'] and ln_equal:
            if src_fn == '' or tar_fn == '':
                return _fmt_retval(self._match_type_dict['no_first'])
        if self._tests & self._test_dict['word_approx']:
            ratio = self._synoname_word_approximation(
                src_ln,
                tar_ln,
                src_fn,
                tar_fn,
                {
                    'gen_conflict': gen_conflict,
                    'roman_conflict': roman_conflict,
                    'src_specials': src_specials,
                    'tar_specials': tar_specials,
                },
            )
            if ratio == 1 and self._tests & self._test_dict['confusions']:
                if (
                    ' '.join((src_fn, src_ln)).strip()
                    == ' '.join((tar_fn, tar_ln)).strip()
                ):
                    return _fmt_retval(self._match_type_dict['confusions'])
            if ratio >= self._word_approx_min:
                return _fmt_retval(self._match_type_dict['word_approx'])
        if self._tests & self._test_dict['char_approx']:
            if ca_ratio >= self._char_approx_min:
                return _fmt_retval(self._match_type_dict['char_approx'])
        return _fmt_retval(self._match_type_dict['no_match'])

    def dist(self, src, tar):
        """Return the normalized Synoname distance between two words.

        Parameters
        ----------
        src : str
            Source string for comparison
        tar : str
            Target string for comparison

        Returns
        -------
        float
            Normalized Synoname distance


        .. versionadded:: 0.3.0
        .. versionchanged:: 0.3.6
            Encapsulated in class

        """
        return self.dist_abs(src, tar, force_numeric=True) / 14


if __name__ == '__main__':
    # When executed as a script, run the doctests embedded in this module.
    from doctest import testmod

    testmod()


1		# Copyright 2018-2020 by Christopher C. Little.
2		# This file is part of Abydos.
3		#
4		# Abydos is free software: you can redistribute it and/or modify
5		# it under the terms of the GNU General Public License as published by
6		# the Free Software Foundation, either version 3 of the License, or
7		# (at your option) any later version.
8		#
9		# Abydos is distributed in the hope that it will be useful,
10		# but WITHOUT ANY WARRANTY; without even the implied warranty of
11		# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12		# GNU General Public License for more details.
13		#
14		# You should have received a copy of the GNU General Public License
15		# along with Abydos. If not, see <http://www.gnu.org/licenses/>.
16
17		"""abydos.distance._synoname.
18
19	1	Synoname.
20		"""
21
22		from collections import Iterable
23
24	1	from ._distance import _Distance
25		from ._levenshtein import Levenshtein
26		from ._ratcliff_obershelp import RatcliffObershelp
27
28		# noinspection PyProtectedMember
29		from ..fingerprint._synoname_toolcode import SynonameToolcode
30
31	1	__all__ = ['Synoname']
32
33	1
34		class Synoname(_Distance):
35	1	"""Synoname.
36	1
37	1	Cf. :cite:`Getty:1991,Gross:1991`
38	1
39		.. versionadded:: 0.3.6
40		"""
41	1
42		_lev = Levenshtein()
43	1	_ratcliff_obershelp = RatcliffObershelp()
44
45		_stc = SynonameToolcode()
46	1
47		_test_dict = {
48		val: 2 ** n
49		for n, val in enumerate(
50		(
51		'exact',
52		'omission',
53		'substitution',
54	1	'transposition',
55		'punctuation',
56	1	'initials',
57		'extension',
58		'inclusion',
59		'no_first',
60		'word_approx',
61		'confusions',
62		'char_approx',
63		)
64		)
65		}
66		_match_name = (
67		'',
68		'exact',
69		'omission',
70		'substitution',
71		'transposition',
72		'punctuation',
73		'initials',
74		'extension',
75	1	'inclusion',
76		'no_first',
77		'word_approx',
78		'confusions',
79		'char_approx',
80		'no_match',
81		)
82		_match_type_dict = {val: n for n, val in enumerate(_match_name)}
83
84		def _synoname_strip_punct(self, word):
85		"""Return a word with punctuation stripped out.
86
87		Parameters
88		----------
89		word : str
90		A word to strip punctuation from
91	1
92		Returns
93	1	-------
94		str
95		The word stripped of punctuation
96
97		Examples
98		--------
99		>>> pe = Synoname()
100		>>> pe._synoname_strip_punct('AB;CD EF-GH$IJ')
101		'ABCD EFGHIJ'
102
103
104		.. versionadded:: 0.3.0
105		.. versionchanged:: 0.3.6
106		Encapsulated in class
107
108		"""
109		stripped = ''
110		for char in word:
111		if char not in set(',-./:;"&\'()!{\|}?$%*+<=>[\\]^_`~'):
112		stripped += char
113		return stripped.strip()
114
115		def _synoname_word_approximation(
116		self, src_ln, tar_ln, src_fn='', tar_fn='', features=None
117		):
118	1	"""Return the Synoname word approximation score for two names.
119	1
120	1	Parameters
121	1	----------
122	1	src_ln : str
123		Last name of the source
124	1	tar_ln : str
125		Last name of the target
126		src_fn : str
127		First name of the source (optional)
128		tar_fn : str
129		First name of the target (optional)
130		features : dict
131		A dict containing special features calculated using
132		:py:class:`fingerprint.SynonameToolcode` (optional)
133
134		Returns
135		-------
136		float
137		The word approximation score
138
139		Examples
140		--------
141		>>> pe = Synoname()
142		>>> pe._synoname_word_approximation('Smith Waterman', 'Waterman',
143		... 'Tom Joe Bob', 'Tom Joe')
144		0.6
145
146
147		.. versionadded:: 0.3.0
148		.. versionchanged:: 0.3.6
149		Encapsulated in class
150
151		"""
152		if features is None:
153		features = {}
154		if 'src_specials' not in features:
155		features['src_specials'] = []
156		if 'tar_specials' not in features:
157		features['tar_specials'] = []
158
159		src_len_specials = len(features['src_specials'])
160		tar_len_specials = len(features['tar_specials'])
161	1
162	1	# 1
163	1	if ('gen_conflict' in features and features['gen_conflict']) or (
164	1	'roman_conflict' in features and features['roman_conflict']
165	1	):
166	1	return 0
167
168	1	# 3 & 7
169	1	full_tar1 = ' '.join((tar_ln, tar_fn)).replace('-', ' ').strip()
170		for s_pos, s_type in features['tar_specials']:
171		if s_type == 'a':
172	1	full_tar1 = full_tar1[
173		: -(
174		1
175	1	+ len(
176		self._stc._synoname_special_table[ # noqa: SF01
177		s_pos
178	1	][1]
179	1	)
180	1	)
181	1	]
182		elif s_type == 'b':
183		loc = (
184		full_tar1.find(
185		' '
186		+ self._stc._synoname_special_table[ # noqa: SF01
187		s_pos
188		][1]
189		+ ' '
190		)
191	1	+ 1
192	1	)
193		full_tar1 = (
194		full_tar1[:loc]
195		+ full_tar1[
196		loc
197		+ len(
198		self._stc._synoname_special_table[ # noqa: SF01
199		s_pos
200		][1]
201		) :
202	1	]
203		)
204		elif s_type == 'c':
205		full_tar1 = full_tar1[
206		1
207		+ len(
208		self._stc._synoname_special_table[s_pos][ # noqa: SF01
209		1
210		]
211		) :
212		]
213	1
214	1	full_src1 = ' '.join((src_ln, src_fn)).replace('-', ' ').strip()
215		for s_pos, s_type in features['src_specials']:
216		if s_type == 'a':
217		full_src1 = full_src1[
218		: -(
219		1
220		+ len(
221		self._stc._synoname_special_table[ # noqa: SF01
222		s_pos
223	1	][1]
224	1	)
225	1	)
226	1	]
227		elif s_type == 'b':
228		loc = (
229		full_src1.find(
230		' '
231		+ self._stc._synoname_special_table[ # noqa: SF01
232		s_pos
233		][1]
234		+ ' '
235		)
236	1	+ 1
237	1	)
238		full_src1 = (
239		full_src1[:loc]
240		+ full_src1[
241		loc
242		+ len(
243		self._stc._synoname_special_table[ # noqa: SF01
244		s_pos
245		][1]
246		) :
247	1	]
248		)
249		elif s_type == 'c':
250		full_src1 = full_src1[
251		1
252		+ len(
253		self._stc._synoname_special_table[s_pos][ # noqa: SF01
254		1
255		]
256		) :
257		]
258	1
259	1	full_tar2 = full_tar1
260		for s_pos, s_type in features['tar_specials']:
261		if s_type == 'd':
262		full_tar2 = full_tar2[
263		len(
264		self._stc._synoname_special_table[s_pos][ # noqa: SF01
265		1
266		]
267		) :
268	1	]
269	1	elif (
270	1	s_type == 'X'
271	1	and self._stc._synoname_special_table[s_pos][1] # noqa: SF01
272		in full_tar2
273		):
274		loc = full_tar2.find(
275		' '
276		+ self._stc._synoname_special_table[s_pos][1] # noqa: SF01
277		)
278	1	full_tar2 = (
279		full_tar2[:loc]
280		+ full_tar2[
281		loc
282		+ len(
283	1	self._stc._synoname_special_table[ # noqa: SF01
284		s_pos
285		][1]
286		) :
287	1	]
288		)
289
290		full_src2 = full_src1
291		for s_pos, s_type in features['src_specials']:
292		if s_type == 'd':
293		full_src2 = full_src2[
294		len(
295		self._stc._synoname_special_table[s_pos][ # noqa: SF01
296		1
297		]
298		) :
299	1	]
300	1	elif (
301	1	s_type == 'X'
302	1	and self._stc._synoname_special_table[s_pos][1] # noqa: SF01
303		in full_src2
304		):
305		loc = full_src2.find(
306		' '
307		+ self._stc._synoname_special_table[s_pos][1] # noqa: SF01
308		)
309	1	full_src2 = (
310		full_src2[:loc]
311		+ full_src2[
312		loc
313		+ len(
314	1	self._stc._synoname_special_table[ # noqa: SF01
315		s_pos
316		][1]
317		) :
318	1	]
319		)
320
321		full_tar1 = self._synoname_strip_punct(full_tar1)
322		tar1_words = full_tar1.split()
323		tar1_num_words = len(tar1_words)
324
325		full_src1 = self._synoname_strip_punct(full_src1)
326		src1_words = full_src1.split()
327		src1_num_words = len(src1_words)
328
329		full_tar2 = self._synoname_strip_punct(full_tar2)
330	1	tar2_words = full_tar2.split()
331	1	tar2_num_words = len(tar2_words)
332	1
333		full_src2 = self._synoname_strip_punct(full_src2)
334	1	src2_words = full_src2.split()
335	1	src2_num_words = len(src2_words)
336	1
337		# 2
338	1	if (
339	1	src1_num_words < 2
340	1	and src_len_specials == 0
341		and src2_num_words < 2
342	1	and tar_len_specials == 0
343	1	):
344	1	return 0
345
346		# 4
347	1	if (
348		tar1_num_words == 1
349		and src1_num_words == 1
350		and tar1_words[0] == src1_words[0]
351		):
352		return 1
353	1	if tar1_num_words < 2 and tar_len_specials == 0:
354		return 0
355
356	1	# 5
357		last_found = False
358		for word in tar1_words:
359		if src_ln.endswith(word) or word + ' ' in src_ln:
360		last_found = True
361	1
362	1	if not last_found:
363	1	for word in src1_words:
364		if tar_ln.endswith(word) or word + ' ' in tar_ln:
365		last_found = True
366	1
367	1	# 6
368	1	matches = 0
369	1	if last_found:
370		for i, s_word in enumerate(src1_words):
371	1	for j, t_word in enumerate(tar1_words):
372	1	if s_word == t_word:
373	1	src1_words[i] = '@'
374	1	tar1_words[j] = '@'
375		matches += 1
376		w_ratio = matches / max(tar1_num_words, src1_num_words)
377	1	if matches > 1 or (
378	1	matches == 1
379	1	and src1_num_words == 1
380	1	and tar1_num_words == 1
381	1	and (tar_len_specials > 0 or src_len_specials > 0)
382	1	):
383	1	return w_ratio
384	1
385	1	# 8
386	1	if (
387		tar2_num_words == 1
388		and src2_num_words == 1
389		and tar2_words[0] == src2_words[0]
390		):
391		return 1
392	1	# I see no way that the following can be True if the equivalent in
393		# #4 was False.
394		if tar2_num_words < 2 and tar_len_specials == 0: # pragma: no cover
395	1	return 0
396
397		# 9
398		last_found = False
399		for word in tar2_words:
400	1	if src_ln.endswith(word) or word + ' ' in src_ln:
401		last_found = True
402
403		if not last_found:
404		for word in src2_words:
405		if tar_ln.endswith(word) or word + ' ' in tar_ln:
406		last_found = True
407	1
408	1	if not last_found:
409	1	return 0
410	1
411		# 10
412	1	matches = 0
413	1	if last_found:
414	1	for i, s_word in enumerate(src2_words):
415	1	for j, t_word in enumerate(tar2_words):
416		if s_word == t_word:
417	1	src2_words[i] = '@'
418	1	tar2_words[j] = '@'
419		matches += 1
420		w_ratio = matches / max(tar2_num_words, src2_num_words)
421	1	if matches > 1 or (
422	1	matches == 1
423	1	and src2_num_words == 1
424	1	and tar2_num_words == 1
425	1	and (tar_len_specials > 0 or src_len_specials > 0)
426	1	):
427	1	return w_ratio
428	1
429	1	return 0
430	1
def __init__(
    self,
    word_approx_min=0.3,
    char_approx_min=0.73,
    tests=2 ** 12 - 1,
    ret_name=False,
    **kwargs
):
    """Initialize Synoname instance.

    Parameters
    ----------
    word_approx_min : float
        The minimum word approximation value to signal a 'word_approx'
        match
    char_approx_min : float
        The minimum character approximation value to signal a 'char_approx'
        match
    tests : int, str, or Iterable
        Either an integer bitmask indicating tests to perform, a single
        test name, or an iterable of test names (defaults to performing
        all tests). Names that are not keys of ``_test_dict`` are ignored,
        and a name given more than once counts only once.
    ret_name : bool
        If True, returns the match name rather than its integer equivalent
    **kwargs
        Arbitrary keyword arguments


    .. versionadded:: 0.4.0

    """
    super(Synoname, self).__init__(**kwargs)
    self._word_approx_min = word_approx_min
    self._char_approx_min = char_approx_min
    self._ret_name = ret_name

    if isinstance(tests, str):
        # A bare name is itself iterable; without this it would be walked
        # character by character and select no tests at all.
        tests = (tests,)

    if isinstance(tests, Iterable):
        mask = 0
        for term in tests:
            if term in self._test_dict:
                # OR rather than add: adding a repeated name would carry
                # into the neighbouring bit and enable the wrong test.
                mask |= self._test_dict[term]
        tests = mask

    self._tests = tests
473	1
def dist_abs(self, src, tar, force_numeric=False):
    """Return the Synoname similarity type of two words.

    The enabled tests (``self._tests``) are tried in a fixed order and the
    first one that matches determines the result.

    Parameters
    ----------
    src : str or tuple
        Source name for comparison: a ``(last name, first name,
        qualifier)`` tuple, a ``#``-delimited string whose last three
        fields are read in that order, or a plain string, which is taken
        as a last name with empty first name and qualifier
    tar : str or tuple
        Target name for comparison, in any of the forms accepted for src
    force_numeric : bool
        Overrides the instance's ret_name setting

    Returns
    -------
    int (or str if ret_name is True)
        Synoname value

    Notes
    -----
    The edit-distance tests reconfigure ``self._lev`` (its ``_cost`` and
    ``_mode``) before each use.

    Examples
    --------
    >>> cmp = Synoname()
    >>> cmp.dist_abs(('Breghel', 'Pieter', ''), ('Brueghel', 'Pieter', ''))
    2

    >>> cmp = Synoname(ret_name=True)
    >>> cmp.dist_abs(('Breghel', 'Pieter', ''), ('Brueghel', 'Pieter', ''))
    'omission'
    >>> cmp.dist_abs(('Dore', 'Gustave', ''),
    ... ('Dore', 'Paul Gustave Louis Christophe', ''))
    'inclusion'
    >>> cmp.dist_abs(('Pereira', 'I. R.', ''), ('Pereira', 'I. Smith', ''))
    'word_approx'


    .. versionadded:: 0.3.0
    .. versionchanged:: 0.3.6
        Encapsulated in class

    """
    if isinstance(src, tuple):
        src_ln, src_fn, src_qual = src
    elif '#' in src:
        src_ln, src_fn, src_qual = src.split('#')[-3:]
    else:
        src_ln, src_fn, src_qual = src, '', ''

    if isinstance(tar, tuple):
        tar_ln, tar_fn, tar_qual = tar
    elif '#' in tar:
        tar_ln, tar_fn, tar_qual = tar.split('#')[-3:]
    else:
        tar_ln, tar_fn, tar_qual = tar, '', ''

    def _split_special(spec):
        # Cut the specials string into 4-character records: a 3-digit
        # integer followed by a 1-character type code.
        spec_list = []
        while spec:
            spec_list.append((int(spec[:3]), spec[3:4]))
            spec = spec[4:]
        return spec_list

    def _fmt_retval(val):
        # Translate the numeric match type to its name when requested.
        if self._ret_name and not force_numeric:
            return self._match_name[val]
        return val

    # 1. Preprocessing

    # Lowercasing
    src_fn = src_fn.strip().lower()
    src_ln = src_ln.strip().lower()
    src_qual = src_qual.strip().lower()

    tar_fn = tar_fn.strip().lower()
    tar_ln = tar_ln.strip().lower()
    tar_qual = tar_qual.strip().lower()

    # Create toolcodes
    src_ln, src_fn, src_tc = self._stc.fingerprint(
        src_ln, src_fn, src_qual
    )
    tar_ln, tar_fn, tar_tc = self._stc.fingerprint(
        tar_ln, tar_fn, tar_qual
    )

    # Fixed-position fields are read from the toolcode first; the
    # specials are the second '$'-delimited field.
    src_generation = int(src_tc[2])
    src_romancode = int(src_tc[3:6])
    src_len_fn = int(src_tc[6:8])
    src_tc = src_tc.split('$')
    src_specials = _split_special(src_tc[1])

    tar_generation = int(tar_tc[2])
    tar_romancode = int(tar_tc[3:6])
    tar_len_fn = int(tar_tc[6:8])
    tar_tc = tar_tc.split('$')
    tar_specials = _split_special(tar_tc[1])

    # A conflict means the two values differ and at least one is non-zero.
    gen_conflict = (src_generation != tar_generation) and bool(
        src_generation or tar_generation
    )
    roman_conflict = (src_romancode != tar_romancode) and bool(
        src_romancode or tar_romancode
    )

    ln_equal = src_ln == tar_ln
    fn_equal = src_fn == tar_fn

    # approx_c
    def _approx_c():
        if gen_conflict or roman_conflict:
            return False, 0

        full_src = ' '.join((src_ln, src_fn))
        if full_src.startswith('master '):
            full_src = full_src[len('master ') :]
            for intro in [
                'of the ',
                'of ',
                'known as the ',
                'with the ',
                'with ',
            ]:
                if full_src.startswith(intro):
                    full_src = full_src[len(intro) :]

        full_tar = ' '.join((tar_ln, tar_fn))
        if full_tar.startswith('master '):
            full_tar = full_tar[len('master ') :]
            for intro in [
                'of the ',
                'of ',
                'known as the ',
                'with the ',
                'with ',
            ]:
                if full_tar.startswith(intro):
                    full_tar = full_tar[len(intro) :]

        loc_ratio = self._ratcliff_obershelp.sim(full_src, full_tar)
        return loc_ratio >= self._char_approx_min, loc_ratio

    # Only the ratio is consulted below; the char_approx test compares it
    # against the threshold itself.
    ca_ratio = _approx_c()[1]

    if self._tests & self._test_dict['exact'] and fn_equal and ln_equal:
        return _fmt_retval(self._match_type_dict['exact'])
    if self._tests & self._test_dict['omission']:
        # NOTE(review): cost tuple is presumably (insert, delete,
        # substitute, transpose) -- confirm against Levenshtein.
        self._lev._cost = (1, 1, 99, 99)  # noqa: SF01
        self._lev._mode = 'lev'  # noqa: SF01
        if fn_equal and self._lev.dist_abs(src_ln, tar_ln) == 1:
            if not roman_conflict:
                return _fmt_retval(self._match_type_dict['omission'])
        elif ln_equal and self._lev.dist_abs(src_fn, tar_fn) == 1:
            return _fmt_retval(self._match_type_dict['omission'])
    if self._tests & self._test_dict['substitution']:
        self._lev._cost = (99, 99, 1, 99)  # noqa: SF01
        self._lev._mode = 'lev'  # noqa: SF01
        if fn_equal and self._lev.dist_abs(src_ln, tar_ln) == 1:
            return _fmt_retval(self._match_type_dict['substitution'])
        elif ln_equal and self._lev.dist_abs(src_fn, tar_fn) == 1:
            return _fmt_retval(self._match_type_dict['substitution'])
    if self._tests & self._test_dict['transposition']:
        self._lev._cost = (99, 99, 99, 1)  # noqa: SF01
        self._lev._mode = 'osa'  # noqa: SF01
        if fn_equal and (self._lev.dist_abs(src_ln, tar_ln) == 1):
            return _fmt_retval(self._match_type_dict['transposition'])
        elif ln_equal and (self._lev.dist_abs(src_fn, tar_fn) == 1):
            return _fmt_retval(self._match_type_dict['transposition'])
    if self._tests & self._test_dict['punctuation']:
        np_src_fn = self._synoname_strip_punct(src_fn)
        np_tar_fn = self._synoname_strip_punct(tar_fn)
        np_src_ln = self._synoname_strip_punct(src_ln)
        np_tar_ln = self._synoname_strip_punct(tar_ln)

        if (np_src_fn == np_tar_fn) and (np_src_ln == np_tar_ln):
            return _fmt_retval(self._match_type_dict['punctuation'])

        # Second pass: hyphens become spaces before stripping.
        np_src_fn = self._synoname_strip_punct(src_fn.replace('-', ' '))
        np_tar_fn = self._synoname_strip_punct(tar_fn.replace('-', ' '))
        np_src_ln = self._synoname_strip_punct(src_ln.replace('-', ' '))
        np_tar_ln = self._synoname_strip_punct(tar_ln.replace('-', ' '))

        if (np_src_fn == np_tar_fn) and (np_src_ln == np_tar_ln):
            return _fmt_retval(self._match_type_dict['punctuation'])

    if self._tests & self._test_dict['initials'] and ln_equal:
        if src_fn and tar_fn:
            src_initials = self._synoname_strip_punct(src_fn).split()
            tar_initials = self._synoname_strip_punct(tar_fn).split()
            # True when every word of at least one first name is a single
            # character (word count equals total character count).
            initials = bool(
                (len(src_initials) == len(''.join(src_initials)))
                or (len(tar_initials) == len(''.join(tar_initials)))
            )
            if initials:
                src_initials = ''.join(_[0] for _ in src_initials)
                tar_initials = ''.join(_[0] for _ in tar_initials)
                if src_initials == tar_initials:
                    return _fmt_retval(self._match_type_dict['initials'])
                initial_diff = abs(len(src_initials) - len(tar_initials))
                self._lev._cost = (1, 99, 99, 99)  # noqa: SF01
                self._lev._mode = 'lev'  # noqa: SF01
                # Match when the length difference alone accounts for the
                # whole edit distance, in either direction.
                if initial_diff and (
                    (
                        initial_diff
                        == self._lev.dist_abs(src_initials, tar_initials)
                    )
                    or (
                        initial_diff
                        == self._lev.dist_abs(tar_initials, src_initials)
                    )
                ):
                    return _fmt_retval(self._match_type_dict['initials'])
    if self._tests & self._test_dict['extension']:
        if src_ln[1:2] == tar_ln[1:2] and (
            src_ln.startswith(tar_ln) or tar_ln.startswith(src_ln)
        ):
            if (
                (not src_len_fn and not tar_len_fn)
                or (tar_fn and src_fn.startswith(tar_fn))
                or (src_fn and tar_fn.startswith(src_fn))
            ) and not roman_conflict:
                return _fmt_retval(self._match_type_dict['extension'])
    if self._tests & self._test_dict['inclusion'] and ln_equal:
        # Either first name contained in the other. The second operand
        # previously tested tar_fn against src_ln, which made the result
        # depend on argument order and matched first names that merely
        # occur inside the (shared) last name.
        if (src_fn and src_fn in tar_fn) or (tar_fn and tar_fn in src_fn):
            return _fmt_retval(self._match_type_dict['inclusion'])
    if self._tests & self._test_dict['no_first'] and ln_equal:
        if src_fn == '' or tar_fn == '':
            return _fmt_retval(self._match_type_dict['no_first'])
    if self._tests & self._test_dict['word_approx']:
        ratio = self._synoname_word_approximation(
            src_ln,
            tar_ln,
            src_fn,
            tar_fn,
            {
                'gen_conflict': gen_conflict,
                'roman_conflict': roman_conflict,
                'src_specials': src_specials,
                'tar_specials': tar_specials,
            },
        )
        if ratio == 1 and self._tests & self._test_dict['confusions']:
            if (
                ' '.join((src_fn, src_ln)).strip()
                == ' '.join((tar_fn, tar_ln)).strip()
            ):
                return _fmt_retval(self._match_type_dict['confusions'])
        if ratio >= self._word_approx_min:
            return _fmt_retval(self._match_type_dict['word_approx'])
    if self._tests & self._test_dict['char_approx']:
        if ca_ratio >= self._char_approx_min:
            return _fmt_retval(self._match_type_dict['char_approx'])
    return _fmt_retval(self._match_type_dict['no_match'])
724	1
def dist(self, src, tar):
    """Return the normalized Synoname distance between two words.

    The numeric match type reported by :meth:`dist_abs` is divided by 14.

    Parameters
    ----------
    src : str
        Source string for comparison
    tar : str
        Target string for comparison

    Returns
    -------
    float
        Normalized Synoname distance


    .. versionadded:: 0.3.0
    .. versionchanged:: 0.3.6
        Encapsulated in class

    """
    # Always ask for the integer code, whatever ret_name is set to.
    match_type = self.dist_abs(src, tar, force_numeric=True)
    return match_type / 14
747	1
748	1
if __name__ == '__main__':
    # Run this module's docstring examples when it is executed directly.
    from doctest import testmod

    testmod()
753

chrislit / abydos

Push — master ( c2a3b6...15a61d )

abydos.distance._synoname.synoname() A

Complexity

Size

Duplication

Code Coverage

Importance

How to fix Long Method

Long Method

Duplication Side-by-Side

Filter issues like