# -*- coding: utf-8 -*-

# Copyright 2018 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.

"""abydos.distance.synoname.

The distance.synoname module implements Synoname.
"""

from __future__ import division, unicode_literals

from collections import Iterable

from .levenshtein import levenshtein
from .sequence import sim_ratcliff_obershelp
# noinspection PyProtectedMember
from ..fingerprint.synoname import _synoname_special_table, synoname_toolcode

__all__ = ['synoname']


def _synoname_strip_punct(word):
    """Return a word with punctuation stripped out.

    :param word: a word to strip punctuation from
    :returns: The word stripped of punctuation

    >>> _synoname_strip_punct('AB;CD EF-GH$IJ')
    'ABCD EFGHIJ'
    """
    stripped = ''
    for char in word:
        if char not in set(',-./:;"&\'()!{|}?$%*+<=>[\\]^_`~'):
            stripped += char
    return stripped.strip()


def _synoname_word_approximation(src_ln, tar_ln, src_fn='', tar_fn='',
                                 features=None):
    """Return the Synoname word approximation score for two names.

    :param str src_ln: last name of the source
    :param str tar_ln: last name of the target
    :param str src_fn: first name of the source (optional)
    :param str tar_fn: first name of the target (optional)
    :param features: a dict containing special features calculated via
        fingerprint.synoname_toolcode() (optional)
    :returns: The word approximation score
    :rtype: float

    >>> _synoname_word_approximation('Smith Waterman', 'Waterman',
    ...                              'Tom Joe Bob', 'Tom Joe')
    0.6
    """
    if features is None:
        features = {}
    if 'src_specials' not in features:
        features['src_specials'] = []
    if 'tar_specials' not in features:
        features['tar_specials'] = []

    src_len_specials = len(features['src_specials'])
    tar_len_specials = len(features['tar_specials'])

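    # Each entry in src_specials/tar_specials is a (position, type) pair
    # indexing into _synoname_special_table; the numbered comments below
    # appear to follow the steps of the original Synoname word approximation
    # procedure.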
    # 1
    if (('gen_conflict' in features and features['gen_conflict']) or
            ('roman_conflict' in features and features['roman_conflict'])):
        return 0

    # 3 & 7
    full_tar1 = ' '.join((tar_ln, tar_fn)).replace('-', ' ').strip()
    for s_pos, s_type in features['tar_specials']:
        if s_type == 'a':
            full_tar1 = full_tar1[:-(1+len(_synoname_special_table[s_pos][1]))]
        elif s_type == 'b':
            loc = full_tar1.find(' '+_synoname_special_table[s_pos][1]+' ')+1
            full_tar1 = (full_tar1[:loc] +
                         full_tar1[loc +
                                   len(_synoname_special_table[s_pos][1]):])
        elif s_type == 'c':
            full_tar1 = full_tar1[1+len(_synoname_special_table[s_pos][1]):]

    full_src1 = ' '.join((src_ln, src_fn)).replace('-', ' ').strip()
    for s_pos, s_type in features['src_specials']:
        if s_type == 'a':
            full_src1 = full_src1[:-(1+len(_synoname_special_table[s_pos][1]))]
        elif s_type == 'b':
            loc = full_src1.find(' '+_synoname_special_table[s_pos][1]+' ')+1
            full_src1 = (full_src1[:loc] +
                         full_src1[loc +
                                   len(_synoname_special_table[s_pos][1]):])
        elif s_type == 'c':
            full_src1 = full_src1[1+len(_synoname_special_table[s_pos][1]):]

    full_tar2 = full_tar1
    for s_pos, s_type in features['tar_specials']:
        if s_type == 'd':
            full_tar2 = full_tar2[len(_synoname_special_table[s_pos][1]):]
        elif s_type == 'X' and _synoname_special_table[s_pos][1] in full_tar2:
            loc = full_tar2.find(' '+_synoname_special_table[s_pos][1])
            full_tar2 = (full_tar2[:loc] +
                         full_tar2[loc +
                                   len(_synoname_special_table[s_pos][1]):])

    full_src2 = full_src1
    for s_pos, s_type in features['src_specials']:
        if s_type == 'd':
            full_src2 = full_src2[len(_synoname_special_table[s_pos][1]):]
        elif s_type == 'X' and _synoname_special_table[s_pos][1] in full_src2:
            loc = full_src2.find(' '+_synoname_special_table[s_pos][1])
            full_src2 = (full_src2[:loc] +
                         full_src2[loc +
                                   len(_synoname_special_table[s_pos][1]):])

    full_tar1 = _synoname_strip_punct(full_tar1)
    tar1_words = full_tar1.split()
    tar1_num_words = len(tar1_words)

    full_src1 = _synoname_strip_punct(full_src1)
    src1_words = full_src1.split()
    src1_num_words = len(src1_words)

    full_tar2 = _synoname_strip_punct(full_tar2)
    tar2_words = full_tar2.split()
    tar2_num_words = len(tar2_words)

    full_src2 = _synoname_strip_punct(full_src2)
    src2_words = full_src2.split()
    src2_num_words = len(src2_words)

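    # Matching is attempted on two forms of each name: form 1 (with type
    # 'a'/'b'/'c' specials stripped) in steps 4-6, and form 2 (with type
    # 'd'/'X' specials also stripped) in steps 8-10 if form 1 is inconclusive.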
    # 2
    if (src1_num_words < 2 and src_len_specials == 0 and src2_num_words < 2 and
            tar_len_specials == 0):
        return 0

    # 4
    if (tar1_num_words == 1 and src1_num_words == 1 and
            tar1_words[0] == src1_words[0]):
        return 1
    if tar1_num_words < 2 and tar_len_specials == 0:
        return 0

    # 5
    last_found = False
    for word in tar1_words:
        if src_ln.endswith(word) or word+' ' in src_ln:
            last_found = True

    if not last_found:
        for word in src1_words:
            if tar_ln.endswith(word) or word+' ' in tar_ln:
                last_found = True

    # 6
    matches = 0
    if last_found:
        for i, s_word in enumerate(src1_words):
            for j, t_word in enumerate(tar1_words):
                if s_word == t_word:
                    src1_words[i] = '@'
                    tar1_words[j] = '@'
                    matches += 1
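    # w_ratio is the fraction of words in the longer of the two names that
    # found an exact counterpart in the other.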
    w_ratio = matches/max(tar1_num_words, src1_num_words)
    if matches > 1 or (matches == 1 and
                       src1_num_words == 1 and tar1_num_words == 1 and
                       (tar_len_specials > 0 or src_len_specials > 0)):
        return w_ratio

    # 8
    if (tar2_num_words == 1 and src2_num_words == 1 and
            tar2_words[0] == src2_words[0]):
        return 1
    # I see no way that the following can be True if the equivalent in
    # #4 was False.
    if tar2_num_words < 2 and tar_len_specials == 0:  # pragma: no cover
        return 0

    # 9
    last_found = False
    for word in tar2_words:
        if src_ln.endswith(word) or word+' ' in src_ln:
            last_found = True

    if not last_found:
        for word in src2_words:
            if tar_ln.endswith(word) or word+' ' in tar_ln:
                last_found = True

    if not last_found:
        return 0

    # 10
    matches = 0
    if last_found:
        for i, s_word in enumerate(src2_words):
            for j, t_word in enumerate(tar2_words):
                if s_word == t_word:
                    src2_words[i] = '@'
                    tar2_words[j] = '@'
                    matches += 1
    w_ratio = matches/max(tar2_num_words, src2_num_words)
    if matches > 1 or (matches == 1 and
                       src2_num_words == 1 and tar2_num_words == 1 and
                       (tar_len_specials > 0 or src_len_specials > 0)):
        return w_ratio

    return 0


def synoname(src, tar, word_approx_min=0.3, char_approx_min=0.73,
             tests=2**12-1, ret_name=False):
    """Return the Synoname similarity type of two names.

    Cf. :cite:`Getty:1991,Gross:1991`

    :param str src: source string for comparison
    :param str tar: target string for comparison
    :param float word_approx_min: the minimum word approximation value to
        signal a 'word_approx' match
    :param float char_approx_min: the minimum character approximation value to
        signal a 'char_approx' match
    :param int or Iterable tests: either an integer indicating tests to
        perform or a list of test names to perform (defaults to performing all
        tests)
    :param bool ret_name: if True, return the name of the match type rather
        than its integer equivalent
    :returns: Synoname value
    :rtype: int (or str if ret_name is True)

    >>> synoname(('Breghel', 'Pieter', ''), ('Brueghel', 'Pieter', ''))
    2
    >>> synoname(('Breghel', 'Pieter', ''), ('Brueghel', 'Pieter', ''),
    ...          ret_name=True)
    'omission'
    >>> synoname(('Dore', 'Gustave', ''),
    ...          ('Dore', 'Paul Gustave Louis Christophe', ''),
    ...          ret_name=True)
    'inclusion'
    >>> synoname(('Pereira', 'I. R.', ''), ('Pereira', 'I. Smith', ''),
    ...          ret_name=True)
    'word_approx'
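
    The tests parameter may also be given as a list of test names; for
    example, limiting the comparison to the exact and omission tests (reusing
    the names from the first example above):

    >>> synoname(('Breghel', 'Pieter', ''), ('Brueghel', 'Pieter', ''),
    ...          tests=['exact', 'omission'], ret_name=True)
    'omission'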
    """
    test_dict = {val: 2**n for n, val in enumerate([
        'exact', 'omission', 'substitution', 'transposition', 'punctuation',
        'initials', 'extension', 'inclusion', 'no_first', 'word_approx',
        'confusions', 'char_approx'])}
    match_name = ['', 'exact', 'omission', 'substitution', 'transposition',
                  'punctuation', 'initials', 'extension', 'inclusion',
                  'no_first', 'word_approx', 'confusions', 'char_approx',
                  'no_match']
    match_type_dict = {val: n for n, val in enumerate(match_name)}

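    # Each test name maps to a distinct power of two, so any subset of tests
    # can be expressed as a single integer bitmask (the default 2**12-1
    # enables all twelve tests); a list of names is folded into such a mask
    # below.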
    if isinstance(tests, Iterable):
        new_tests = 0
        for term in tests:
            if term in test_dict:
                new_tests += test_dict[term]
        tests = new_tests

    if isinstance(src, tuple):
        src_ln, src_fn, src_qual = src
    elif '#' in src:
        src_ln, src_fn, src_qual = src.split('#')[-3:]
    else:
        src_ln, src_fn, src_qual = src, '', ''

    if isinstance(tar, tuple):
        tar_ln, tar_fn, tar_qual = tar
    elif '#' in tar:
        tar_ln, tar_fn, tar_qual = tar.split('#')[-3:]
    else:
        tar_ln, tar_fn, tar_qual = tar, '', ''

    def _split_special(spec):
        spec_list = []
        while spec:
            spec_list.append((int(spec[:3]), spec[3:4]))
            spec = spec[4:]
        return spec_list

    def _fmt_retval(val):
        if ret_name:
            return match_name[val]
        return val

    # 1. Preprocessing

    # Lowercasing
    src_fn = src_fn.strip().lower()
    src_ln = src_ln.strip().lower()
    src_qual = src_qual.strip().lower()

    tar_fn = tar_fn.strip().lower()
    tar_ln = tar_ln.strip().lower()
    tar_qual = tar_qual.strip().lower()

    # Create toolcodes
    src_ln, src_fn, src_tc = synoname_toolcode(src_ln, src_fn, src_qual)
    tar_ln, tar_fn, tar_tc = synoname_toolcode(tar_ln, tar_fn, tar_qual)

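    # Parse the fixed-width toolcode: character 2 appears to hold the
    # generation digit, characters 3-5 the roman numeral code, and
    # characters 6-7 the first-name length; the segment after the first '$'
    # lists any special terms as 3-digit table positions, each followed by a
    # 1-character type.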
    src_generation = int(src_tc[2])
    src_romancode = int(src_tc[3:6])
    src_len_fn = int(src_tc[6:8])
    src_tc = src_tc.split('$')
    src_specials = _split_special(src_tc[1])

    tar_generation = int(tar_tc[2])
    tar_romancode = int(tar_tc[3:6])
    tar_len_fn = int(tar_tc[6:8])
    tar_tc = tar_tc.split('$')
    tar_specials = _split_special(tar_tc[1])

    gen_conflict = ((src_generation != tar_generation) and
                    bool(src_generation or tar_generation))
    roman_conflict = ((src_romancode != tar_romancode) and
                      bool(src_romancode or tar_romancode))

    ln_equal = src_ln == tar_ln
    fn_equal = src_fn == tar_fn

    # approx_c
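    # Character approximation: strip any leading 'master of/with ...' style
    # introduction from both full names and compare what remains using the
    # Ratcliff-Obershelp similarity against char_approx_min.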
    def _approx_c():
        if gen_conflict or roman_conflict:
            return False, 0

        full_src = ' '.join((src_ln, src_fn))
        if full_src.startswith('master '):
            full_src = full_src[len('master '):]
            for intro in ['of the ', 'of ', 'known as the ', 'with the ',
                          'with ']:
                if full_src.startswith(intro):
                    full_src = full_src[len(intro):]

        full_tar = ' '.join((tar_ln, tar_fn))
        if full_tar.startswith('master '):
            full_tar = full_tar[len('master '):]
            for intro in ['of the ', 'of ', 'known as the ', 'with the ',
                          'with ']:
                if full_tar.startswith(intro):
                    full_tar = full_tar[len(intro):]

        loc_ratio = sim_ratcliff_obershelp(full_src, full_tar)
        return loc_ratio >= char_approx_min, loc_ratio

    approx_c_result, ca_ratio = _approx_c()

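    # The tests below are applied in a fixed order, returning the first type
    # that matches. The levenshtein cost tuples weight insertions, deletions,
    # substitutions, and transpositions respectively, so a distance of 1
    # under cost=(1, 1, 99, 99) signals a single omitted character while
    # effectively ruling out substitutions and transpositions.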
    if tests & test_dict['exact'] and fn_equal and ln_equal:
        return _fmt_retval(match_type_dict['exact'])
    if tests & test_dict['omission']:
        if (fn_equal and
                levenshtein(src_ln, tar_ln, cost=(1, 1, 99, 99)) == 1):
            if not roman_conflict:
                return _fmt_retval(match_type_dict['omission'])
        elif (ln_equal and
              levenshtein(src_fn, tar_fn, cost=(1, 1, 99, 99)) == 1):
            return _fmt_retval(match_type_dict['omission'])
    if tests & test_dict['substitution']:
        if (fn_equal and
                levenshtein(src_ln, tar_ln, cost=(99, 99, 1, 99)) == 1):
            return _fmt_retval(match_type_dict['substitution'])
        elif (ln_equal and
              levenshtein(src_fn, tar_fn, cost=(99, 99, 1, 99)) == 1):
            return _fmt_retval(match_type_dict['substitution'])
    if tests & test_dict['transposition']:
        if (fn_equal and
                (levenshtein(src_ln, tar_ln, mode='osa', cost=(99, 99, 99, 1))
                 == 1)):
            return _fmt_retval(match_type_dict['transposition'])
        elif (ln_equal and
              (levenshtein(src_fn, tar_fn, mode='osa', cost=(99, 99, 99, 1))
               == 1)):
            return _fmt_retval(match_type_dict['transposition'])
    if tests & test_dict['punctuation']:
        np_src_fn = _synoname_strip_punct(src_fn)
        np_tar_fn = _synoname_strip_punct(tar_fn)
        np_src_ln = _synoname_strip_punct(src_ln)
        np_tar_ln = _synoname_strip_punct(tar_ln)

        if (np_src_fn == np_tar_fn) and (np_src_ln == np_tar_ln):
            return _fmt_retval(match_type_dict['punctuation'])

        np_src_fn = _synoname_strip_punct(src_fn.replace('-', ' '))
        np_tar_fn = _synoname_strip_punct(tar_fn.replace('-', ' '))
        np_src_ln = _synoname_strip_punct(src_ln.replace('-', ' '))
        np_tar_ln = _synoname_strip_punct(tar_ln.replace('-', ' '))

        if (np_src_fn == np_tar_fn) and (np_src_ln == np_tar_ln):
            return _fmt_retval(match_type_dict['punctuation'])

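    # The initials test fires when at least one first name consists entirely
    # of single-letter words; the strings of initial letters are then
    # compared directly and via an insertions-only Levenshtein distance in
    # both directions.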
    if tests & test_dict['initials'] and ln_equal:
        if src_fn and tar_fn:
            src_initials = _synoname_strip_punct(src_fn).split()
            tar_initials = _synoname_strip_punct(tar_fn).split()
            initials = bool((len(src_initials) == len(''.join(src_initials)))
                            or
                            (len(tar_initials) == len(''.join(tar_initials))))
            if initials:
                src_initials = ''.join(_[0] for _ in src_initials)
                tar_initials = ''.join(_[0] for _ in tar_initials)
                if src_initials == tar_initials:
                    return _fmt_retval(match_type_dict['initials'])
                initial_diff = abs(len(src_initials)-len(tar_initials))
                if (initial_diff and
                        ((initial_diff ==
                          levenshtein(src_initials, tar_initials,
                                      cost=(1, 99, 99, 99))) or
                         (initial_diff ==
                          levenshtein(tar_initials, src_initials,
                                      cost=(1, 99, 99, 99))))):
                    return _fmt_retval(match_type_dict['initials'])
    if tests & test_dict['extension']:
        if src_ln[1] == tar_ln[1] and (src_ln.startswith(tar_ln) or
                                       tar_ln.startswith(src_ln)):
            if (((not src_len_fn and not tar_len_fn) or
                 (tar_fn and src_fn.startswith(tar_fn)) or
                 (src_fn and tar_fn.startswith(src_fn)))
                    and not roman_conflict):
                return _fmt_retval(match_type_dict['extension'])
    if tests & test_dict['inclusion'] and ln_equal:
        if (src_fn and src_fn in tar_fn) or (tar_fn and tar_fn in src_fn):
            return _fmt_retval(match_type_dict['inclusion'])
    if tests & test_dict['no_first'] and ln_equal:
        if src_fn == '' or tar_fn == '':
            return _fmt_retval(match_type_dict['no_first'])
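    # Word approximation: a ratio of exactly 1 with identical recombined
    # 'first last' strings is reported as 'confusions'; otherwise any ratio
    # at or above word_approx_min counts as 'word_approx'.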
    if tests & test_dict['word_approx']:
        ratio = _synoname_word_approximation(src_ln, tar_ln, src_fn, tar_fn,
                                             {'gen_conflict': gen_conflict,
                                              'roman_conflict': roman_conflict,
                                              'src_specials': src_specials,
                                              'tar_specials': tar_specials})
        if ratio == 1 and tests & test_dict['confusions']:
            if (' '.join((src_fn, src_ln)).strip() ==
                    ' '.join((tar_fn, tar_ln)).strip()):
                return _fmt_retval(match_type_dict['confusions'])
        if ratio >= word_approx_min:
            return _fmt_retval(match_type_dict['word_approx'])
    if tests & test_dict['char_approx']:
        if ca_ratio >= char_approx_min:
            return _fmt_retval(match_type_dict['char_approx'])
    return _fmt_retval(match_type_dict['no_match'])


if __name__ == '__main__':
    import doctest
    doctest.testmod()