# -*- coding: utf-8 -*-

# Copyright 2018 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.

"""abydos.distance._synoname.

Synoname.
"""

from __future__ import (
    absolute_import,
    division,
    print_function,
    unicode_literals,
)

from collections import Iterable

from ._distance import _Distance
from ._levenshtein import levenshtein
from ._ratcliff_obershelp import sim_ratcliff_obershelp

# noinspection PyProtectedMember
from ..fingerprint._synoname import SynonameToolcode

__all__ = ['Synoname', 'synoname']


class Synoname(_Distance):
    """Synoname.

    Cf. :cite:`Getty:1991,Gross:1991`
    """

    _stc = SynonameToolcode()

    _test_dict = {
        val: 2 ** n
        for n, val in enumerate(
            (
                'exact',
                'omission',
                'substitution',
                'transposition',
                'punctuation',
                'initials',
                'extension',
                'inclusion',
                'no_first',
                'word_approx',
                'confusions',
                'char_approx',
            )
        )
    }
    _match_name = (
        '',
        'exact',
        'omission',
        'substitution',
        'transposition',
        'punctuation',
        'initials',
        'extension',
        'inclusion',
        'no_first',
        'word_approx',
        'confusions',
        'char_approx',
        'no_match',
    )
    _match_type_dict = {val: n for n, val in enumerate(_match_name)}

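    # The test names above map to single bits: 'exact' -> 1, 'omission' -> 2,
    # ..., 'char_approx' -> 2 ** 11, so the default tests=2 ** 12 - 1 used by
    # dist_abs() enables all twelve tests.  To restrict matching, either form
    # below should work (illustrative only):
    #
    #     Synoname().dist_abs(src, tar, tests=['exact', 'omission'])
    #     Synoname().dist_abs(src, tar, tests=0b11)
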
    def _synoname_strip_punct(self, word):
        """Return a word with punctuation stripped out.

        Parameters
        ----------
        word : str
            A word to strip punctuation from

        Returns
        -------
        str
            The word stripped of punctuation

        Examples
        --------
        >>> pe = Synoname()
        >>> pe._synoname_strip_punct('AB;CD EF-GH$IJ')
        'ABCD EFGHIJ'

        """
        stripped = ''
        for char in word:
            if char not in set(',-./:;"&\'()!{|}?$%*+<=>[\\]^_`~'):
                stripped += char
        return stripped.strip()

    def _synoname_word_approximation(
        self, src_ln, tar_ln, src_fn='', tar_fn='', features=None
    ):
        """Return the Synoname word approximation score for two names.

        Parameters
        ----------
        src_ln : str
            Last name of the source
        tar_ln : str
            Last name of the target
        src_fn : str
            First name of the source (optional)
        tar_fn : str
            First name of the target (optional)
        features : dict
            A dict containing special features calculated using
            :py:class:`fingerprint.SynonameToolcode` (optional)

        Returns
        -------
        float
            The word approximation score

        Examples
        --------
        >>> pe = Synoname()
        >>> pe._synoname_word_approximation('Smith Waterman', 'Waterman',
        ... 'Tom Joe Bob', 'Tom Joe')
        0.6

        """
        if features is None:
            features = {}
        if 'src_specials' not in features:
            features['src_specials'] = []
        if 'tar_specials' not in features:
            features['tar_specials'] = []

        src_len_specials = len(features['src_specials'])
        tar_len_specials = len(features['tar_specials'])

        # 1
        if ('gen_conflict' in features and features['gen_conflict']) or (
            'roman_conflict' in features and features['roman_conflict']
        ):
            return 0

        # 3 & 7
        full_tar1 = ' '.join((tar_ln, tar_fn)).replace('-', ' ').strip()
        for s_pos, s_type in features['tar_specials']:
            if s_type == 'a':
                full_tar1 = full_tar1[
                    : -(
                        1
                        + len(
                            self._stc._synoname_special_table[  # noqa: SF01
                                s_pos
                            ][1]
                        )
                    )
                ]
            elif s_type == 'b':
                loc = (
                    full_tar1.find(
                        ' '
                        + self._stc._synoname_special_table[  # noqa: SF01
                            s_pos
                        ][1]
                        + ' '
                    )
                    + 1
                )
                full_tar1 = (
                    full_tar1[:loc]
                    + full_tar1[
                        loc
                        + len(
                            self._stc._synoname_special_table[  # noqa: SF01
                                s_pos
                            ][1]
                        ) :
                    ]
                )
            elif s_type == 'c':
                full_tar1 = full_tar1[
                    1
                    + len(
                        self._stc._synoname_special_table[s_pos][  # noqa: SF01
                            1
                        ]
                    ) :
                ]

        full_src1 = ' '.join((src_ln, src_fn)).replace('-', ' ').strip()
        for s_pos, s_type in features['src_specials']:
            if s_type == 'a':
                full_src1 = full_src1[
                    : -(
                        1
                        + len(
                            self._stc._synoname_special_table[  # noqa: SF01
                                s_pos
                            ][1]
                        )
                    )
                ]
            elif s_type == 'b':
                loc = (
                    full_src1.find(
                        ' '
                        + self._stc._synoname_special_table[  # noqa: SF01
                            s_pos
                        ][1]
                        + ' '
                    )
                    + 1
                )
                full_src1 = (
                    full_src1[:loc]
                    + full_src1[
                        loc
                        + len(
                            self._stc._synoname_special_table[  # noqa: SF01
                                s_pos
                            ][1]
                        ) :
                    ]
                )
            elif s_type == 'c':
                full_src1 = full_src1[
                    1
                    + len(
                        self._stc._synoname_special_table[s_pos][  # noqa: SF01
                            1
                        ]
                    ) :
                ]

        full_tar2 = full_tar1
        for s_pos, s_type in features['tar_specials']:
            if s_type == 'd':
                full_tar2 = full_tar2[
                    len(
                        self._stc._synoname_special_table[s_pos][  # noqa: SF01
                            1
                        ]
                    ) :
                ]
            elif (
                s_type == 'X'
                and self._stc._synoname_special_table[s_pos][1]  # noqa: SF01
                in full_tar2
            ):
                loc = full_tar2.find(
                    ' '
                    + self._stc._synoname_special_table[s_pos][1]  # noqa: SF01
                )
                full_tar2 = (
                    full_tar2[:loc]
                    + full_tar2[
                        loc
                        + len(
                            self._stc._synoname_special_table[  # noqa: SF01
                                s_pos
                            ][1]
                        ) :
                    ]
                )

        full_src2 = full_src1
        for s_pos, s_type in features['src_specials']:
            if s_type == 'd':
                full_src2 = full_src2[
                    len(
                        self._stc._synoname_special_table[s_pos][  # noqa: SF01
                            1
                        ]
                    ) :
                ]
            elif (
                s_type == 'X'
                and self._stc._synoname_special_table[s_pos][1]  # noqa: SF01
                in full_src2
            ):
                loc = full_src2.find(
                    ' '
                    + self._stc._synoname_special_table[s_pos][1]  # noqa: SF01
                )
                full_src2 = (
                    full_src2[:loc]
                    + full_src2[
                        loc
                        + len(
                            self._stc._synoname_special_table[  # noqa: SF01
                                s_pos
                            ][1]
                        ) :
                    ]
                )

        full_tar1 = self._synoname_strip_punct(full_tar1)
        tar1_words = full_tar1.split()
        tar1_num_words = len(tar1_words)

        full_src1 = self._synoname_strip_punct(full_src1)
        src1_words = full_src1.split()
        src1_num_words = len(src1_words)

        full_tar2 = self._synoname_strip_punct(full_tar2)
        tar2_words = full_tar2.split()
        tar2_num_words = len(tar2_words)

        full_src2 = self._synoname_strip_punct(full_src2)
        src2_words = full_src2.split()
        src2_num_words = len(src2_words)

        # 2
        if (
            src1_num_words < 2
            and src_len_specials == 0
            and src2_num_words < 2
            and tar_len_specials == 0
        ):
            return 0

        # 4
        if (
            tar1_num_words == 1
            and src1_num_words == 1
            and tar1_words[0] == src1_words[0]
        ):
            return 1
        if tar1_num_words < 2 and tar_len_specials == 0:
            return 0

        # 5
        last_found = False
        for word in tar1_words:
            if src_ln.endswith(word) or word + ' ' in src_ln:
                last_found = True

        if not last_found:
            for word in src1_words:
                if tar_ln.endswith(word) or word + ' ' in tar_ln:
                    last_found = True

        # 6
        matches = 0
        if last_found:
            for i, s_word in enumerate(src1_words):
                for j, t_word in enumerate(tar1_words):
                    if s_word == t_word:
                        src1_words[i] = '@'
                        tar1_words[j] = '@'
                        matches += 1
        w_ratio = matches / max(tar1_num_words, src1_num_words)
        if matches > 1 or (
            matches == 1
            and src1_num_words == 1
            and tar1_num_words == 1
            and (tar_len_specials > 0 or src_len_specials > 0)
        ):
            return w_ratio

        # 8
        if (
            tar2_num_words == 1
            and src2_num_words == 1
            and tar2_words[0] == src2_words[0]
        ):
            return 1
        # I see no way that the following can be True if the equivalent in
        # #4 was False.
        if tar2_num_words < 2 and tar_len_specials == 0:  # pragma: no cover
            return 0

        # 9
        last_found = False
        for word in tar2_words:
            if src_ln.endswith(word) or word + ' ' in src_ln:
                last_found = True

        if not last_found:
            for word in src2_words:
                if tar_ln.endswith(word) or word + ' ' in tar_ln:
                    last_found = True

        if not last_found:
            return 0

        # 10
        matches = 0
        if last_found:
            for i, s_word in enumerate(src2_words):
                for j, t_word in enumerate(tar2_words):
                    if s_word == t_word:
                        src2_words[i] = '@'
                        tar2_words[j] = '@'
                        matches += 1
        w_ratio = matches / max(tar2_num_words, src2_num_words)
        if matches > 1 or (
            matches == 1
            and src2_num_words == 1
            and tar2_num_words == 1
            and (tar_len_specials > 0 or src_len_specials > 0)
        ):
            return w_ratio

        return 0

    def dist_abs(
        self,
        src,
        tar,
        word_approx_min=0.3,
        char_approx_min=0.73,
        tests=2 ** 12 - 1,
        ret_name=False,
    ):
        """Return the Synoname similarity type of two words.

        Parameters
        ----------
        src : str
            Source string for comparison
        tar : str
            Target string for comparison
        word_approx_min : float
            The minimum word approximation value to signal a 'word_approx'
            match
        char_approx_min : float
            The minimum character approximation value to signal a 'char_approx'
            match
        tests : int or Iterable
            Either an integer indicating tests to perform or a list of test
            names to perform (defaults to performing all tests)
        ret_name : bool
            If True, returns the match name rather than its integer equivalent

        Returns
        -------
        int (or str if ret_name is True)
            Synoname value

        Examples
        --------
        >>> cmp = Synoname()
        >>> cmp.dist_abs(('Breghel', 'Pieter', ''), ('Brueghel', 'Pieter', ''))
        2
        >>> cmp.dist_abs(('Breghel', 'Pieter', ''), ('Brueghel', 'Pieter', ''),
        ... ret_name=True)
        'omission'
        >>> cmp.dist_abs(('Dore', 'Gustave', ''),
        ... ('Dore', 'Paul Gustave Louis Christophe', ''), ret_name=True)
        'inclusion'
        >>> cmp.dist_abs(('Pereira', 'I. R.', ''), ('Pereira', 'I. Smith', ''),
        ... ret_name=True)
        'word_approx'

        """
        if isinstance(tests, Iterable):
            new_tests = 0
            for term in tests:
                if term in self._test_dict:
                    new_tests += self._test_dict[term]
            tests = new_tests

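        # Each of src and tar may be a (last name, first name, qualifier)
        # tuple, a '#'-delimited string whose last three fields are taken as
        # last name, first name, and qualifier, or a bare string treated as a
        # last name only (illustrative forms: ('Brueghel', 'Pieter', ''),
        # 'Brueghel#Pieter#', or 'Brueghel').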
        if isinstance(src, tuple):
            src_ln, src_fn, src_qual = src
        elif '#' in src:
            src_ln, src_fn, src_qual = src.split('#')[-3:]
        else:
            src_ln, src_fn, src_qual = src, '', ''

        if isinstance(tar, tuple):
            tar_ln, tar_fn, tar_qual = tar
        elif '#' in tar:
            tar_ln, tar_fn, tar_qual = tar.split('#')[-3:]
        else:
            tar_ln, tar_fn, tar_qual = tar, '', ''

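        # A toolcode's '$'-delimited specials field is a sequence of
        # 4-character codes: a 3-digit index into the special table followed
        # by a 1-letter type.  For instance, a specials string such as
        # '035a048b' (hypothetical) would parse to [(35, 'a'), (48, 'b')].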
        def _split_special(spec):
            spec_list = []
            while spec:
                spec_list.append((int(spec[:3]), spec[3:4]))
                spec = spec[4:]
            return spec_list

        def _fmt_retval(val):
            if ret_name:
                return self._match_name[val]
            return val

        # 1. Preprocessing

        # Lowercasing
        src_fn = src_fn.strip().lower()
        src_ln = src_ln.strip().lower()
        src_qual = src_qual.strip().lower()

        tar_fn = tar_fn.strip().lower()
        tar_ln = tar_ln.strip().lower()
        tar_qual = tar_qual.strip().lower()

        # Create toolcodes
        src_ln, src_fn, src_tc = self._stc.fingerprint(
            src_ln, src_fn, src_qual
        )
        tar_ln, tar_fn, tar_tc = self._stc.fingerprint(
            tar_ln, tar_fn, tar_qual
        )

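        # Unpack the fixed-width prefix of each toolcode: character 2 holds a
        # generation digit, characters 3-5 a Roman numeral code, and
        # characters 6-7 the first-name length; the field after the first '$'
        # is the specials string parsed by _split_special().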
        src_generation = int(src_tc[2])
        src_romancode = int(src_tc[3:6])
        src_len_fn = int(src_tc[6:8])
        src_tc = src_tc.split('$')
        src_specials = _split_special(src_tc[1])

        tar_generation = int(tar_tc[2])
        tar_romancode = int(tar_tc[3:6])
        tar_len_fn = int(tar_tc[6:8])
        tar_tc = tar_tc.split('$')
        tar_specials = _split_special(tar_tc[1])

        gen_conflict = (src_generation != tar_generation) and bool(
            src_generation or tar_generation
        )
        roman_conflict = (src_romancode != tar_romancode) and bool(
            src_romancode or tar_romancode
        )

        ln_equal = src_ln == tar_ln
        fn_equal = src_fn == tar_fn

        # approx_c
        def _approx_c():
            if gen_conflict or roman_conflict:
                return False, 0

            full_src = ' '.join((src_ln, src_fn))
            if full_src.startswith('master '):
                full_src = full_src[len('master ') :]
                for intro in [
                    'of the ',
                    'of ',
                    'known as the ',
                    'with the ',
                    'with ',
                ]:
                    if full_src.startswith(intro):
                        full_src = full_src[len(intro) :]

            full_tar = ' '.join((tar_ln, tar_fn))
            if full_tar.startswith('master '):
                full_tar = full_tar[len('master ') :]
                for intro in [
                    'of the ',
                    'of ',
                    'known as the ',
                    'with the ',
                    'with ',
                ]:
                    if full_tar.startswith(intro):
                        full_tar = full_tar[len(intro) :]

            loc_ratio = sim_ratcliff_obershelp(full_src, full_tar)
            return loc_ratio >= char_approx_min, loc_ratio

        approx_c_result, ca_ratio = _approx_c()

        if tests & self._test_dict['exact'] and fn_equal and ln_equal:
            return _fmt_retval(self._match_type_dict['exact'])
        if tests & self._test_dict['omission']:
            if (
                fn_equal
                and levenshtein(src_ln, tar_ln, cost=(1, 1, 99, 99)) == 1
            ):
                if not roman_conflict:
                    return _fmt_retval(self._match_type_dict['omission'])
            elif (
                ln_equal
                and levenshtein(src_fn, tar_fn, cost=(1, 1, 99, 99)) == 1
            ):
                return _fmt_retval(self._match_type_dict['omission'])
        if tests & self._test_dict['substitution']:
            if (
                fn_equal
                and levenshtein(src_ln, tar_ln, cost=(99, 99, 1, 99)) == 1
            ):
                return _fmt_retval(self._match_type_dict['substitution'])
            elif (
                ln_equal
                and levenshtein(src_fn, tar_fn, cost=(99, 99, 1, 99)) == 1
            ):
                return _fmt_retval(self._match_type_dict['substitution'])
        if tests & self._test_dict['transposition']:
            if fn_equal and (
                levenshtein(src_ln, tar_ln, mode='osa', cost=(99, 99, 99, 1))
                == 1
            ):
                return _fmt_retval(self._match_type_dict['transposition'])
            elif ln_equal and (
                levenshtein(src_fn, tar_fn, mode='osa', cost=(99, 99, 99, 1))
                == 1
            ):
                return _fmt_retval(self._match_type_dict['transposition'])
        if tests & self._test_dict['punctuation']:
            np_src_fn = self._synoname_strip_punct(src_fn)
            np_tar_fn = self._synoname_strip_punct(tar_fn)
            np_src_ln = self._synoname_strip_punct(src_ln)
            np_tar_ln = self._synoname_strip_punct(tar_ln)

            if (np_src_fn == np_tar_fn) and (np_src_ln == np_tar_ln):
                return _fmt_retval(self._match_type_dict['punctuation'])

            np_src_fn = self._synoname_strip_punct(src_fn.replace('-', ' '))
            np_tar_fn = self._synoname_strip_punct(tar_fn.replace('-', ' '))
            np_src_ln = self._synoname_strip_punct(src_ln.replace('-', ' '))
            np_tar_ln = self._synoname_strip_punct(tar_ln.replace('-', ' '))

            if (np_src_fn == np_tar_fn) and (np_src_ln == np_tar_ln):
                return _fmt_retval(self._match_type_dict['punctuation'])

        if tests & self._test_dict['initials'] and ln_equal:
            if src_fn and tar_fn:
                src_initials = self._synoname_strip_punct(src_fn).split()
                tar_initials = self._synoname_strip_punct(tar_fn).split()
                initials = bool(
                    (len(src_initials) == len(''.join(src_initials)))
                    or (len(tar_initials) == len(''.join(tar_initials)))
                )
                if initials:
                    src_initials = ''.join(_[0] for _ in src_initials)
                    tar_initials = ''.join(_[0] for _ in tar_initials)
                    if src_initials == tar_initials:
                        return _fmt_retval(self._match_type_dict['initials'])
                    initial_diff = abs(len(src_initials) - len(tar_initials))
                    if initial_diff and (
                        (
                            initial_diff
                            == levenshtein(
                                src_initials,
                                tar_initials,
                                cost=(1, 99, 99, 99),
                            )
                        )
                        or (
                            initial_diff
                            == levenshtein(
                                tar_initials,
                                src_initials,
                                cost=(1, 99, 99, 99),
                            )
                        )
                    ):
                        return _fmt_retval(self._match_type_dict['initials'])
        if tests & self._test_dict['extension']:
            if src_ln[1] == tar_ln[1] and (
                src_ln.startswith(tar_ln) or tar_ln.startswith(src_ln)
            ):
                if (
                    (not src_len_fn and not tar_len_fn)
                    or (tar_fn and src_fn.startswith(tar_fn))
                    or (src_fn and tar_fn.startswith(src_fn))
                ) and not roman_conflict:
                    return _fmt_retval(self._match_type_dict['extension'])
        if tests & self._test_dict['inclusion'] and ln_equal:
            if (src_fn and src_fn in tar_fn) or (tar_fn and tar_fn in src_ln):
                return _fmt_retval(self._match_type_dict['inclusion'])
        if tests & self._test_dict['no_first'] and ln_equal:
            if src_fn == '' or tar_fn == '':
                return _fmt_retval(self._match_type_dict['no_first'])
        if tests & self._test_dict['word_approx']:
            ratio = self._synoname_word_approximation(
                src_ln,
                tar_ln,
                src_fn,
                tar_fn,
                {
                    'gen_conflict': gen_conflict,
                    'roman_conflict': roman_conflict,
                    'src_specials': src_specials,
                    'tar_specials': tar_specials,
                },
            )
            if ratio == 1 and tests & self._test_dict['confusions']:
                if (
                    ' '.join((src_fn, src_ln)).strip()
                    == ' '.join((tar_fn, tar_ln)).strip()
                ):
                    return _fmt_retval(self._match_type_dict['confusions'])
            if ratio >= word_approx_min:
                return _fmt_retval(self._match_type_dict['word_approx'])
        if tests & self._test_dict['char_approx']:
            if ca_ratio >= char_approx_min:
                return _fmt_retval(self._match_type_dict['char_approx'])
        return _fmt_retval(self._match_type_dict['no_match'])

    def dist(
        self,
        src,
        tar,
        word_approx_min=0.3,
        char_approx_min=0.73,
        tests=2 ** 12 - 1,
    ):
        """Return the normalized Synoname distance between two words.

        Parameters
        ----------
        src : str
            Source string for comparison
        tar : str
            Target string for comparison
        word_approx_min : float
            The minimum word approximation value to signal a 'word_approx'
            match
        char_approx_min : float
            The minimum character approximation value to signal a 'char_approx'
            match
        tests : int or Iterable
            Either an integer indicating tests to perform or a list of test
            names to perform (defaults to performing all tests)

        Returns
        -------
        float
            Normalized Synoname distance

        """
        return (
            synoname(src, tar, word_approx_min, char_approx_min, tests, False)
            / 14
        )


def synoname(
    src,
    tar,
    word_approx_min=0.3,
    char_approx_min=0.73,
    tests=2 ** 12 - 1,
    ret_name=False,
):
    """Return the Synoname similarity type of two words.

    This is a wrapper for :py:meth:`Synoname.dist_abs`.

    Parameters
    ----------
    src : str
        Source string for comparison
    tar : str
        Target string for comparison
    word_approx_min : float
        The minimum word approximation value to signal a 'word_approx' match
    char_approx_min : float
        The minimum character approximation value to signal a 'char_approx'
        match
    tests : int or Iterable
        Either an integer indicating tests to perform or a list of test names
        to perform (defaults to performing all tests)
    ret_name : bool
        If True, returns the match name rather than its integer equivalent

    Returns
    -------
    int (or str if ret_name is True)
        Synoname value

    Examples
    --------
    >>> synoname(('Breghel', 'Pieter', ''), ('Brueghel', 'Pieter', ''))
    2
    >>> synoname(('Breghel', 'Pieter', ''), ('Brueghel', 'Pieter', ''),
    ... ret_name=True)
    'omission'
    >>> synoname(('Dore', 'Gustave', ''),
    ... ('Dore', 'Paul Gustave Louis Christophe', ''), ret_name=True)
    'inclusion'
    >>> synoname(('Pereira', 'I. R.', ''), ('Pereira', 'I. Smith', ''),
    ... ret_name=True)
    'word_approx'

    """
    return Synoname().dist_abs(
        src, tar, word_approx_min, char_approx_min, tests, ret_name
    )


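# An integer returned by synoname() can be mapped back to its name via
# Synoname._match_name, e.g. (illustrative) Synoname._match_name[2] is
# 'omission'; passing ret_name=True performs the same lookup internally.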
if __name__ == '__main__':
    import doctest

    doctest.testmod()