abydos.distance._strcmp95.sim_strcmp95() - Code Metrics - Inspection of "0.3.6" - chrislit/abydos - Measure and Improve Code Quality continuously with Scrutinizer

Completed

Pull Request — master (#141)

by Chris

created 2018-11-10 03:25 UTC

abydos.distance._strcmp95.sim_strcmp95() A

↳ Parent: abydos.distance._strcmp95

Complexity

Conditions

Size

Total Lines	29
Code Lines	2

Duplication

Lines	0
Ratio	0 %

Code Coverage

Tests	2
CRAP Score	1

Importance

Changes

Metric	Value
cc	1
eloc	2
nop	3
dl	0
loc	29
ccs	2
cts	2
cp	1
crap	1
rs	10
c	0
b	0
f	0

# -*- coding: utf-8 -*-

# Copyright 2014-2018 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.

"""abydos.distance._strcmp95.

The strcmp95 algorithm variant of Jaro-Winkler distance
"""

from __future__ import (
    absolute_import,
    division,
    print_function,
    unicode_literals,
)

from collections import defaultdict

from six.moves import range

from ._distance import _Distance

__all__ = ['Strcmp95', 'dist_strcmp95', 'sim_strcmp95']


class Strcmp95(_Distance):

    """Strcmp95.

    This is a Python translation of the C code for strcmp95:
    http://web.archive.org/web/20110629121242/http://www.census.gov/geo/msb/stand/strcmp.c
    :cite:`Winkler:1994`.
    The above file is a US Government publication and, accordingly,
    in the public domain.

    This is based on the Jaro-Winkler distance, but also attempts to correct
    for some common typos and frequently confused characters. It is also
    limited to uppercase ASCII characters, so it is appropriate to American
    names, but not much else.
    """

    _sp_mx = (
        ('A', 'E'),
        ('A', 'I'),
        ('A', 'O'),
        ('A', 'U'),
        ('B', 'V'),
        ('E', 'I'),
        ('E', 'O'),
        ('E', 'U'),
        ('I', 'O'),
        ('I', 'U'),
        ('O', 'U'),
        ('I', 'Y'),
        ('E', 'Y'),
        ('C', 'G'),
        ('E', 'F'),
        ('W', 'U'),
        ('W', 'V'),
        ('X', 'K'),
        ('S', 'Z'),
        ('X', 'S'),
        ('Q', 'C'),
        ('U', 'V'),
        ('M', 'N'),
        ('L', 'I'),
        ('Q', 'O'),
        ('P', 'R'),
        ('I', 'J'),
        ('2', 'Z'),
        ('5', 'S'),
        ('8', 'B'),
        ('1', 'I'),
        ('1', 'L'),
        ('0', 'O'),
        ('0', 'Q'),
        ('C', 'K'),
        ('G', 'J'),
    )

    def sim(self, src, tar, long_strings=False):

        """Return the strcmp95 similarity of two strings.

        Args:
            src (str): Source string for comparison
            tar (str): Target string for comparison
            long_strings (bool): Set to True to increase the probability of a
                match when the number of matched characters is large. This
                option allows for a little more tolerance when the strings are
                large. It is not an appropriate test when comparing fixed
                length fields such as phone and social security numbers.

        Returns:
            float: Strcmp95 similarity

        Examples:
            >>> cmp = Strcmp95()
            >>> cmp.sim('cat', 'hat')
            0.7777777777777777
            >>> cmp.sim('Niall', 'Neil')
            0.8454999999999999
            >>> cmp.sim('aluminum', 'Catalan')
            0.6547619047619048
            >>> cmp.sim('ATCG', 'TAGC')
            0.8333333333333334

        """

        def _in_range(char):
            """Return True if char is in the range (0, 91).

            Args:
                char (str): The character to check

            Returns:
                bool: True if char is in the range (0, 91)

            """
            return 91 > ord(char) > 0

        ying = src.strip().upper()
        yang = tar.strip().upper()

        if ying == yang:
            return 1.0
        # If either string is blank - return - added in Version 2
        if not ying or not yang:
            return 0.0

        adjwt = defaultdict(int)

        # Initialize the adjwt array on the first call to the function only.
        # The adjwt array is used to give partial credit for characters that
        # may be errors due to known phonetic or character recognition errors.
        # A typical example is to match the letter "O" with the number "0"
        for i in self._sp_mx:
            adjwt[(i[0], i[1])] = 3
            adjwt[(i[1], i[0])] = 3

        if len(ying) > len(yang):
            search_range = len(ying)
            minv = len(yang)
        else:
            search_range = len(yang)
            minv = len(ying)

        # Blank out the flags
        ying_flag = [0] * search_range
        yang_flag = [0] * search_range
        search_range = max(0, search_range // 2 - 1)

        # Looking only within the search range,
        # count and flag the matched pairs.
        num_com = 0
        yl1 = len(yang) - 1
        for i in range(len(ying)):

            low_lim = (i - search_range) if (i >= search_range) else 0
            hi_lim = (i + search_range) if ((i + search_range) <= yl1) else yl1
            for j in range(low_lim, hi_lim + 1):
                if (yang_flag[j] == 0) and (yang[j] == ying[i]):
                    yang_flag[j] = 1
                    ying_flag[i] = 1
                    num_com += 1
                    break

        # If no characters in common - return
        if num_com == 0:
            return 0.0

        # Count the number of transpositions
        k = n_trans = 0
        for i in range(len(ying)):

            if ying_flag[i] != 0:
                j = 0
                for j in range(k, len(yang)):  # pragma: no branch
                    if yang_flag[j] != 0:
                        k = j + 1
                        break
                if ying[i] != yang[j]:
                    n_trans += 1
        n_trans //= 2

        # Adjust for similarities in unmatched characters
        n_simi = 0
        if minv > num_com:

            for i in range(len(ying)):

                if ying_flag[i] == 0 and _in_range(ying[i]):
                    for j in range(len(yang)):

                        if yang_flag[j] == 0 and _in_range(yang[j]):
                            if (ying[i], yang[j]) in adjwt:
                                n_simi += adjwt[(ying[i], yang[j])]
                                yang_flag[j] = 2
                                break
        num_sim = n_simi / 10.0 + num_com

        # Main weight computation
        weight = (
            num_sim / len(ying)
            + num_sim / len(yang)
            + (num_com - n_trans) / num_com
        )
        weight /= 3.0

        # Continue to boost the weight if the strings are similar
        if weight > 0.7:

            # Adjust for having up to the first 4 characters in common
            j = 4 if (minv >= 4) else minv
            i = 0
            while (i < j) and (ying[i] == yang[i]) and (not ying[i].isdigit()):
                i += 1
            if i:
                weight += i * 0.1 * (1.0 - weight)

            # Optionally adjust for long strings.

            # After agreeing beginning chars, at least two more must agree and
            # the agreeing characters must be > .5 of remaining characters.
            if (
                long_strings

                and (minv > 4)

                and (num_com > i + 1)

                and (2 * num_com >= minv + i)

            ):
                if not ying[0].isdigit():
                    weight += (1.0 - weight) * (
                        (num_com - i - 1) / (len(ying) + len(yang) - i * 2 + 2)
                    )

        return weight


def sim_strcmp95(src, tar, long_strings=False):
    """Return the strcmp95 similarity of two strings.

    This is a wrapper for :py:meth:`Strcmp95.sim`.

    Args:
        src (str): Source string for comparison
        tar (str): Target string for comparison
        long_strings (bool): Set to True to increase the probability of a
            match when the number of matched characters is large. This option
            allows for a little more tolerance when the strings are large. It
            is not an appropriate test when comparing fixed length fields such
            as phone and social security numbers.

    Returns:
        float: strcmp95 similarity

    Examples:
        >>> sim_strcmp95('cat', 'hat')
        0.7777777777777777
        >>> sim_strcmp95('Niall', 'Neil')
        0.8454999999999999
        >>> sim_strcmp95('aluminum', 'Catalan')
        0.6547619047619048
        >>> sim_strcmp95('ATCG', 'TAGC')
        0.8333333333333334

    """
    return Strcmp95().sim(src, tar, long_strings)


def dist_strcmp95(src, tar, long_strings=False):
    """Return the strcmp95 distance between two strings.

    This is a wrapper for :py:meth:`Strcmp95.dist`.

    Args:
        src (str): Source string for comparison
        tar (str): Target string for comparison
        long_strings (bool): Set to True to increase the probability of a
            match when the number of matched characters is large. This option
            allows for a little more tolerance when the strings are large. It
            is not an appropriate test when comparing fixed length fields such
            as phone and social security numbers.

    Returns:
        float: strcmp95 distance

    Examples:
        >>> round(dist_strcmp95('cat', 'hat'), 12)
        0.222222222222
        >>> round(dist_strcmp95('Niall', 'Neil'), 12)
        0.1545
        >>> round(dist_strcmp95('aluminum', 'Catalan'), 12)
        0.345238095238
        >>> round(dist_strcmp95('ATCG', 'TAGC'), 12)
        0.166666666667

    """
    return Strcmp95().dist(src, tar, long_strings)


if __name__ == '__main__':
    import doctest

    doctest.testmod()


1		# -- coding: utf-8 --
2
3		# Copyright 2014-2018 by Christopher C. Little.
4		# This file is part of Abydos.
5		#
6		# Abydos is free software: you can redistribute it and/or modify
7		# it under the terms of the GNU General Public License as published by
8		# the Free Software Foundation, either version 3 of the License, or
9		# (at your option) any later version.
10		#
11		# Abydos is distributed in the hope that it will be useful,
12		# but WITHOUT ANY WARRANTY; without even the implied warranty of
13		# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14		# GNU General Public License for more details.
15		#
16		# You should have received a copy of the GNU General Public License
17		# along with Abydos. If not, see <http://www.gnu.org/licenses/>.
18
19	1	"""abydos.distance._strcmp95.
20
21		The strcmp95 algorithm variant of Jaro-Winkler distance
22		"""
23
24	1	from __future__ import (
25		absolute_import,
26		division,
27		print_function,
28		unicode_literals,
29		)
30
31	1	from collections import defaultdict
32
33	1	from six.moves import range
34
35	1	from ._distance import _Distance
36
37	1	__all__ = ['Strcmp95', 'dist_strcmp95', 'sim_strcmp95']
38
39
40	1	class Strcmp95(_Distance):
		0 ignored issues – show Unused Code introduced 2018-11-10 01:42 UTC by Report Bug Copy Issue Report The variable `__class__` seems to be unused. Loading history...
41		"""Strcmp95.
42
43		This is a Python translation of the C code for strcmp95:
44		http://web.archive.org/web/20110629121242/http://www.census.gov/geo/msb/stand/strcmp.c
45		:cite:`Winkler:1994`.
46		The above file is a US Government publication and, accordingly,
47		in the public domain.
48
49		This is based on the Jaro-Winkler distance, but also attempts to correct
50		for some common typos and frequently confused characters. It is also
51		limited to uppercase ASCII characters, so it is appropriate to American
52		names, but not much else.
53		"""
54
55	1	_sp_mx = (
56		('A', 'E'),
57		('A', 'I'),
58		('A', 'O'),
59		('A', 'U'),
60		('B', 'V'),
61		('E', 'I'),
62		('E', 'O'),
63		('E', 'U'),
64		('I', 'O'),
65		('I', 'U'),
66		('O', 'U'),
67		('I', 'Y'),
68		('E', 'Y'),
69		('C', 'G'),
70		('E', 'F'),
71		('W', 'U'),
72		('W', 'V'),
73		('X', 'K'),
74		('S', 'Z'),
75		('X', 'S'),
76		('Q', 'C'),
77		('U', 'V'),
78		('M', 'N'),
79		('L', 'I'),
80		('Q', 'O'),
81		('P', 'R'),
82		('I', 'J'),
83		('2', 'Z'),
84		('5', 'S'),
85		('8', 'B'),
86		('1', 'I'),
87		('1', 'L'),
88		('0', 'O'),
89		('0', 'Q'),
90		('C', 'K'),
91		('G', 'J'),
92		)
93
94	1	def sim(self, src, tar, long_strings=False):
		0 ignored issues – show Comprehensibility introduced 2018-11-04 08:02 UTC by Report Bug Copy Issue Report This function exceeds the maximum number of variables (24/15). Loading history... Bug introduced 2018-11-04 08:02 UTC by Report Bug Copy Issue Report Parameters differ from overridden 'sim' method Loading history...
95		"""Return the strcmp95 similarity of two strings.
96
97		Args:
98		src (str): Source string for comparison
99		tar (str): Target string for comparison
100		long_strings (bool): Set to True to increase the probability of a
101		match when the number of matched characters is large. This
102		option allows for a little more tolerance when the strings are
103		large. It is not an appropriate test when comparing fixed
104		length fields such as phone and social security numbers.
105
106		Returns:
107		float: Strcmp95 similarity
108
109		Examples:
110		>>> cmp = Strcmp95()
111		>>> cmp.sim('cat', 'hat')
112		0.7777777777777777
113		>>> cmp.sim('Niall', 'Neil')
114		0.8454999999999999
115		>>> cmp.sim('aluminum', 'Catalan')
116		0.6547619047619048
117		>>> cmp.sim('ATCG', 'TAGC')
118		0.8333333333333334
119
120		"""
121
122	1	def _in_range(char):
123		"""Return True if char is in the range (0, 91).
124
125		Args:
126		char (str): The character to check
127
128		Returns:
129		bool: True if char is in the range (0, 91)
130
131		"""
132	1	return 91 > ord(char) > 0
133
134	1	ying = src.strip().upper()
135	1	yang = tar.strip().upper()
136
137	1	if ying == yang:
138	1	return 1.0
139		# If either string is blank - return - added in Version 2
140	1	if not ying or not yang:
141	1	return 0.0
142
143	1	adjwt = defaultdict(int)
144
145		# Initialize the adjwt array on the first call to the function only.
146		# The adjwt array is used to give partial credit for characters that
147		# may be errors due to known phonetic or character recognition errors.
148		# A typical example is to match the letter "O" with the number "0"
149	1	for i in self._sp_mx:
150	1	adjwt[(i[0], i[1])] = 3
151	1	adjwt[(i[1], i[0])] = 3
152
153	1	if len(ying) > len(yang):
154	1	search_range = len(ying)
155	1	minv = len(yang)
156		else:
157	1	search_range = len(yang)
158	1	minv = len(ying)
159
160		# Blank out the flags
161	1	ying_flag = [0] * search_range
162	1	yang_flag = [0] * search_range
163	1	search_range = max(0, search_range // 2 - 1)
164
165		# Looking only within the search range,
166		# count and flag the matched pairs.
167	1	num_com = 0
168	1	yl1 = len(yang) - 1
169	1	for i in range(len(ying)):
		0 ignored issues – show unused-code introduced 2018-08-02 19:04 UTC by Report Bug Copy Issue Report Consider using enumerate instead of iterating with range and len Loading history...
170	1	low_lim = (i - search_range) if (i >= search_range) else 0
171	1	hi_lim = (i + search_range) if ((i + search_range) <= yl1) else yl1
172	1	for j in range(low_lim, hi_lim + 1):
173	1	if (yang_flag[j] == 0) and (yang[j] == ying[i]):
174	1	yang_flag[j] = 1
175	1	ying_flag[i] = 1
176	1	num_com += 1
177	1	break
178
179		# If no characters in common - return
180	1	if num_com == 0:
181	1	return 0.0
182
183		# Count the number of transpositions
184	1	k = n_trans = 0
185	1	for i in range(len(ying)):
		0 ignored issues – show unused-code introduced 2018-08-02 19:04 UTC by Report Bug Copy Issue Report Consider using enumerate instead of iterating with range and len Loading history...
186	1	if ying_flag[i] != 0:
187	1	j = 0
188	1	for j in range(k, len(yang)): # pragma: no branch
189	1	if yang_flag[j] != 0:
190	1	k = j + 1
191	1	break
192	1	if ying[i] != yang[j]:
193	1	n_trans += 1
194	1	n_trans //= 2
195
196		# Adjust for similarities in unmatched characters
197	1	n_simi = 0
198	1	if minv > num_com:
		0 ignored issues – show unused-code introduced 2018-08-02 19:04 UTC by Report Bug Copy Issue Report Too many nested blocks (6/5) Loading history...
199	1	for i in range(len(ying)):
		0 ignored issues – show unused-code introduced 2018-08-02 19:04 UTC by Report Bug Copy Issue Report Consider using enumerate instead of iterating with range and len Loading history...
200	1	if ying_flag[i] == 0 and _in_range(ying[i]):
201	1	for j in range(len(yang)):
		0 ignored issues – show unused-code introduced 2018-08-02 19:04 UTC by Report Bug Copy Issue Report Consider using enumerate instead of iterating with range and len Loading history...
202	1	if yang_flag[j] == 0 and _in_range(yang[j]):
203	1	if (ying[i], yang[j]) in adjwt:
204	1	n_simi += adjwt[(ying[i], yang[j])]
205	1	yang_flag[j] = 2
206	1	break
207	1	num_sim = n_simi / 10.0 + num_com
208
209		# Main weight computation
210	1	weight = (
211		num_sim / len(ying)
212		+ num_sim / len(yang)
213		+ (num_com - n_trans) / num_com
214		)
215	1	weight /= 3.0
216
217		# Continue to boost the weight if the strings are similar
218	1	if weight > 0.7:
219
220		# Adjust for having up to the first 4 characters in common
221	1	j = 4 if (minv >= 4) else minv
222	1	i = 0
223	1	while (i < j) and (ying[i] == yang[i]) and (not ying[i].isdigit()):
224	1	i += 1
225	1	if i:
226	1	weight += i * 0.1 * (1.0 - weight)
227
228		# Optionally adjust for long strings.
229
230		# After agreeing beginning chars, at least two more must agree and
231		# the agreeing characters must be > .5 of remaining characters.
232	1	if (
233		long_strings
		0 ignored issues – show Coding Style introduced 2018-10-24 06:00 UTC by Report Bug Copy Issue Report Wrong hanging indentation before block (add 4 spaces). Loading history...
234		and (minv > 4)
		0 ignored issues – show Coding Style introduced 2018-10-24 06:00 UTC by Report Bug Copy Issue Report Wrong hanging indentation before block (add 4 spaces). Loading history...
235		and (num_com > i + 1)
		0 ignored issues – show Coding Style introduced 2018-10-24 06:00 UTC by Report Bug Copy Issue Report Wrong hanging indentation before block (add 4 spaces). Loading history...
236		and (2 * num_com >= minv + i)
		0 ignored issues – show Coding Style introduced 2018-10-24 06:00 UTC by Report Bug Copy Issue Report Wrong hanging indentation before block (add 4 spaces). Loading history...
237		):
238	1	if not ying[0].isdigit():
239	1	weight += (1.0 - weight) * (
240		(num_com - i - 1) / (len(ying) + len(yang) - i * 2 + 2)
241		)
242
243	1	return weight
244
245
246	1	def sim_strcmp95(src, tar, long_strings=False):
247		"""Return the strcmp95 similarity of two strings.
248
249		This is a wrapper for :py:meth:`Strcmp95.sim`.
250
251		Args:
252		src (str): Source string for comparison
253		tar (str): Target string for comparison
254		long_strings (bool): Set to True to increase the probability of a
255		match when the number of matched characters is large. This option
256		allows for a little more tolerance when the strings are large. It
257		is not an appropriate test when comparing fixed length fields such
258		as phone and social security numbers.
259
260		Returns:
261		float: strcmp95 similarity
262
263		Examples:
264		>>> sim_strcmp95('cat', 'hat')
265		0.7777777777777777
266		>>> sim_strcmp95('Niall', 'Neil')
267		0.8454999999999999
268		>>> sim_strcmp95('aluminum', 'Catalan')
269		0.6547619047619048
270		>>> sim_strcmp95('ATCG', 'TAGC')
271		0.8333333333333334
272
273		"""
274	1	return Strcmp95().sim(src, tar, long_strings)
275
276
277	1	def dist_strcmp95(src, tar, long_strings=False):
278		"""Return the strcmp95 distance between two strings.
279
280		This is a wrapper for :py:meth:`Strcmp95.dist`.
281
282		Args:
283		src (str): Source string for comparison
284		tar (str): Target string for comparison
285		long_strings (bool): Set to True to increase the probability of a
286		match when the number of matched characters is large. This option
287		allows for a little more tolerance when the strings are large. It
288		is not an appropriate test when comparing fixed length fields such
289		as phone and social security numbers.
290
291		Returns:
292		float: strcmp95 distance
293
294		Examples:
295		>>> round(dist_strcmp95('cat', 'hat'), 12)
296		0.222222222222
297		>>> round(dist_strcmp95('Niall', 'Neil'), 12)
298		0.1545
299		>>> round(dist_strcmp95('aluminum', 'Catalan'), 12)
300		0.345238095238
301		>>> round(dist_strcmp95('ATCG', 'TAGC'), 12)
302		0.166666666667
303
304		"""
305	1	return Strcmp95().dist(src, tar, long_strings)
306
307
308		if __name__ == '__main__':
309		import doctest
310
311		doctest.testmod()
312

chrislit / abydos

Pull Request — master (#141)

abydos.distance._strcmp95.sim_strcmp95() A

Complexity

Size

Duplication

Code Coverage

Importance

Duplication Side-by-Side

Filter issues like