abydos.distance._strcmp95 - Code Metrics - Inspection of "Merge pull request #149 from chrislit/0.3.6" - chrislit/abydos - Measure and Improve Code Quality continuously with Scrutinizer

Completed

Push — master ( f43547...71985b )

by Chris

created 2018-11-17 08:52 UTC

abydos.distance._strcmp95 A

↳ Parent: Project

Complexity

Total Complexity

Size/Duplication

Total Lines	337
Duplicated Lines	0 %

Test Coverage

Coverage

100%

Importance

Changes

Metric	Value
eloc	132
dl	0
loc	337
ccs	81
cts	81
cp	1
rs	9.28
c	0
b	0
f	0
wmc	39

2 Functions

Rating	Name	Duplication	Size	Complexity
A	dist_strcmp95()	0	36	1
A	sim_strcmp95()	0	36	1

1 Method

Rating	Name	Duplication	Size	Complexity
F	Strcmp95.sim()	0	161	37

# -*- coding: utf-8 -*-

# Copyright 2014-2018 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.

"""abydos.distance._strcmp95.

The strcmp95 algorithm variant of Jaro-Winkler distance
"""

from __future__ import (
    absolute_import,
    division,
    print_function,
    unicode_literals,
)

from collections import defaultdict

from six.moves import range

from ._distance import _Distance

__all__ = ['Strcmp95', 'dist_strcmp95', 'sim_strcmp95']


class Strcmp95(_Distance):

    """Strcmp95.

    This is a Python translation of the C code for strcmp95:
    http://web.archive.org/web/20110629121242/http://www.census.gov/geo/msb/stand/strcmp.c
    :cite:`Winkler:1994`.
    The above file is a US Government publication and, accordingly,
    in the public domain.

    This is based on the Jaro-Winkler distance, but also attempts to correct
    for some common typos and frequently confused characters. It is also
    limited to uppercase ASCII characters, so it is appropriate to American
    names, but not much else.
    """

    _sp_mx = (
        ('A', 'E'),
        ('A', 'I'),
        ('A', 'O'),
        ('A', 'U'),
        ('B', 'V'),
        ('E', 'I'),
        ('E', 'O'),
        ('E', 'U'),
        ('I', 'O'),
        ('I', 'U'),
        ('O', 'U'),
        ('I', 'Y'),
        ('E', 'Y'),
        ('C', 'G'),
        ('E', 'F'),
        ('W', 'U'),
        ('W', 'V'),
        ('X', 'K'),
        ('S', 'Z'),
        ('X', 'S'),
        ('Q', 'C'),
        ('U', 'V'),
        ('M', 'N'),
        ('L', 'I'),
        ('Q', 'O'),
        ('P', 'R'),
        ('I', 'J'),
        ('2', 'Z'),
        ('5', 'S'),
        ('8', 'B'),
        ('1', 'I'),
        ('1', 'L'),
        ('0', 'O'),
        ('0', 'Q'),
        ('C', 'K'),
        ('G', 'J'),
    )

    def sim(self, src, tar, long_strings=False):

        """Return the strcmp95 similarity of two strings.

        Parameters
        ----------
        src : str
            Source string for comparison
        tar : str
            Target string for comparison
        long_strings : bool
            Set to True to increase the probability of a match when the number
            of matched characters is large. This option allows for a little
            more tolerance when the strings are large. It is not an appropriate
            test when comparing fixed length fields such as phone and social
            security numbers.

        Returns
        -------
        float
            Strcmp95 similarity

        Examples
        --------
        >>> cmp = Strcmp95()
        >>> cmp.sim('cat', 'hat')
        0.7777777777777777
        >>> cmp.sim('Niall', 'Neil')
        0.8454999999999999
        >>> cmp.sim('aluminum', 'Catalan')
        0.6547619047619048
        >>> cmp.sim('ATCG', 'TAGC')
        0.8333333333333334

        """

        def _in_range(char):
            """Return True if char is in the range (0, 91).

            Parameters
            ----------
            char : str
                The character to check

            Returns
            -------
            bool
                True if char is in the range (0, 91)

            """
            return 91 > ord(char) > 0

        ying = src.strip().upper()
        yang = tar.strip().upper()

        if ying == yang:
            return 1.0
        # If either string is blank - return - added in Version 2
        if not ying or not yang:
            return 0.0

        adjwt = defaultdict(int)

        # Initialize the adjwt array on the first call to the function only.
        # The adjwt array is used to give partial credit for characters that
        # may be errors due to known phonetic or character recognition errors.
        # A typical example is to match the letter "O" with the number "0"
        for i in self._sp_mx:
            adjwt[(i[0], i[1])] = 3
            adjwt[(i[1], i[0])] = 3

        if len(ying) > len(yang):
            search_range = len(ying)
            minv = len(yang)
        else:
            search_range = len(yang)
            minv = len(ying)

        # Blank out the flags
        ying_flag = [0] * search_range
        yang_flag = [0] * search_range
        search_range = max(0, search_range // 2 - 1)

        # Looking only within the search range,
        # count and flag the matched pairs.
        num_com = 0
        yl1 = len(yang) - 1
        for i in range(len(ying)):

            low_lim = (i - search_range) if (i >= search_range) else 0
            hi_lim = (i + search_range) if ((i + search_range) <= yl1) else yl1
            for j in range(low_lim, hi_lim + 1):
                if (yang_flag[j] == 0) and (yang[j] == ying[i]):
                    yang_flag[j] = 1
                    ying_flag[i] = 1
                    num_com += 1
                    break

        # If no characters in common - return
        if num_com == 0:
            return 0.0

        # Count the number of transpositions
        k = n_trans = 0
        for i in range(len(ying)):

            if ying_flag[i] != 0:
                j = 0
                for j in range(k, len(yang)):  # pragma: no branch
                    if yang_flag[j] != 0:
                        k = j + 1
                        break
                if ying[i] != yang[j]:
                    n_trans += 1
        n_trans //= 2

        # Adjust for similarities in unmatched characters
        n_simi = 0
        if minv > num_com:

            for i in range(len(ying)):

                if ying_flag[i] == 0 and _in_range(ying[i]):
                    for j in range(len(yang)):

                        if yang_flag[j] == 0 and _in_range(yang[j]):
                            if (ying[i], yang[j]) in adjwt:
                                n_simi += adjwt[(ying[i], yang[j])]
                                yang_flag[j] = 2
                                break
        num_sim = n_simi / 10.0 + num_com

        # Main weight computation
        weight = (
            num_sim / len(ying)
            + num_sim / len(yang)
            + (num_com - n_trans) / num_com
        )
        weight /= 3.0

        # Continue to boost the weight if the strings are similar
        if weight > 0.7:

            # Adjust for having up to the first 4 characters in common
            j = 4 if (minv >= 4) else minv
            i = 0
            while (i < j) and (ying[i] == yang[i]) and (not ying[i].isdigit()):
                i += 1
            if i:
                weight += i * 0.1 * (1.0 - weight)

            # Optionally adjust for long strings.

            # After agreeing beginning chars, at least two more must agree and
            # the agreeing characters must be > .5 of remaining characters.
            if (
                long_strings

                and (minv > 4)

                and (num_com > i + 1)

                and (2 * num_com >= minv + i)

            ):
                if not ying[0].isdigit():
                    weight += (1.0 - weight) * (
                        (num_com - i - 1) / (len(ying) + len(yang) - i * 2 + 2)
                    )

        return weight


def sim_strcmp95(src, tar, long_strings=False):
    """Return the strcmp95 similarity of two strings.

    This is a wrapper for :py:meth:`Strcmp95.sim`.

    Parameters
    ----------
    src : str
        Source string for comparison
    tar : str
        Target string for comparison
    long_strings : bool
        Set to True to increase the probability of a match when the number of
        matched characters is large. This option allows for a little more
        tolerance when the strings are large. It is not an appropriate test
        when comparing fixed length fields such as phone and social security
        numbers.

    Returns
    -------
    float
        Strcmp95 similarity

    Examples
    --------
    >>> sim_strcmp95('cat', 'hat')
    0.7777777777777777
    >>> sim_strcmp95('Niall', 'Neil')
    0.8454999999999999
    >>> sim_strcmp95('aluminum', 'Catalan')
    0.6547619047619048
    >>> sim_strcmp95('ATCG', 'TAGC')
    0.8333333333333334

    """
    return Strcmp95().sim(src, tar, long_strings)


def dist_strcmp95(src, tar, long_strings=False):
    """Return the strcmp95 distance between two strings.

    This is a wrapper for :py:meth:`Strcmp95.dist`.

    Parameters
    ----------
    src : str
        Source string for comparison
    tar : str
        Target string for comparison
    long_strings : bool
        Set to True to increase the probability of a match when the number of
        matched characters is large. This option allows for a little more
        tolerance when the strings are large. It is not an appropriate test
        when comparing fixed length fields such as phone and social security
        numbers.

    Returns
    -------
    float
        Strcmp95 distance

    Examples
    --------
    >>> round(dist_strcmp95('cat', 'hat'), 12)
    0.222222222222
    >>> round(dist_strcmp95('Niall', 'Neil'), 12)
    0.1545
    >>> round(dist_strcmp95('aluminum', 'Catalan'), 12)
    0.345238095238
    >>> round(dist_strcmp95('ATCG', 'TAGC'), 12)
    0.166666666667

    """
    return Strcmp95().dist(src, tar, long_strings)


if __name__ == '__main__':
    import doctest

    doctest.testmod()


1		# -- coding: utf-8 --
2
3		# Copyright 2014-2018 by Christopher C. Little.
4		# This file is part of Abydos.
5		#
6		# Abydos is free software: you can redistribute it and/or modify
7		# it under the terms of the GNU General Public License as published by
8		# the Free Software Foundation, either version 3 of the License, or
9		# (at your option) any later version.
10		#
11		# Abydos is distributed in the hope that it will be useful,
12		# but WITHOUT ANY WARRANTY; without even the implied warranty of
13		# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14		# GNU General Public License for more details.
15		#
16		# You should have received a copy of the GNU General Public License
17		# along with Abydos. If not, see <http://www.gnu.org/licenses/>.
18
19	1	"""abydos.distance._strcmp95.
20
21		The strcmp95 algorithm variant of Jaro-Winkler distance
22		"""
23
24	1	from __future__ import (
25		absolute_import,
26		division,
27		print_function,
28		unicode_literals,
29		)
30
31	1	from collections import defaultdict
32
33	1	from six.moves import range
34
35	1	from ._distance import _Distance
36
37	1	__all__ = ['Strcmp95', 'dist_strcmp95', 'sim_strcmp95']
38
39
40	1	class Strcmp95(_Distance):
		0 ignored issues – show Unused Code introduced 2018-11-10 01:42 UTC by Report Bug Copy Issue Report The variable `__class__` seems to be unused. Loading history...
41		"""Strcmp95.
42
43		This is a Python translation of the C code for strcmp95:
44		http://web.archive.org/web/20110629121242/http://www.census.gov/geo/msb/stand/strcmp.c
45		:cite:`Winkler:1994`.
46		The above file is a US Government publication and, accordingly,
47		in the public domain.
48
49		This is based on the Jaro-Winkler distance, but also attempts to correct
50		for some common typos and frequently confused characters. It is also
51		limited to uppercase ASCII characters, so it is appropriate to American
52		names, but not much else.
53		"""
54
55	1	_sp_mx = (
56		('A', 'E'),
57		('A', 'I'),
58		('A', 'O'),
59		('A', 'U'),
60		('B', 'V'),
61		('E', 'I'),
62		('E', 'O'),
63		('E', 'U'),
64		('I', 'O'),
65		('I', 'U'),
66		('O', 'U'),
67		('I', 'Y'),
68		('E', 'Y'),
69		('C', 'G'),
70		('E', 'F'),
71		('W', 'U'),
72		('W', 'V'),
73		('X', 'K'),
74		('S', 'Z'),
75		('X', 'S'),
76		('Q', 'C'),
77		('U', 'V'),
78		('M', 'N'),
79		('L', 'I'),
80		('Q', 'O'),
81		('P', 'R'),
82		('I', 'J'),
83		('2', 'Z'),
84		('5', 'S'),
85		('8', 'B'),
86		('1', 'I'),
87		('1', 'L'),
88		('0', 'O'),
89		('0', 'Q'),
90		('C', 'K'),
91		('G', 'J'),
92		)
93
94	1	def sim(self, src, tar, long_strings=False):
		0 ignored issues – show Comprehensibility introduced 2018-11-04 08:02 UTC by Report Bug Copy Issue Report This function exceeds the maximum number of variables (24/15). Loading history... Bug introduced 2018-11-04 08:02 UTC by Report Bug Copy Issue Report Parameters differ from overridden 'sim' method Loading history...
95		"""Return the strcmp95 similarity of two strings.
96
97		Parameters
98		----------
99		src : str
100		Source string for comparison
101		tar : str
102		Target string for comparison
103		long_strings : bool
104		Set to True to increase the probability of a match when the number
105		of matched characters is large. This option allows for a little
106		more tolerance when the strings are large. It is not an appropriate
107		test when comparing fixed length fields such as phone and social
108		security numbers.
109
110		Returns
111		-------
112		float
113		Strcmp95 similarity
114
115		Examples
116		--------
117		>>> cmp = Strcmp95()
118		>>> cmp.sim('cat', 'hat')
119		0.7777777777777777
120		>>> cmp.sim('Niall', 'Neil')
121		0.8454999999999999
122		>>> cmp.sim('aluminum', 'Catalan')
123		0.6547619047619048
124		>>> cmp.sim('ATCG', 'TAGC')
125		0.8333333333333334
126
127		"""
128
129	1	def _in_range(char):
130		"""Return True if char is in the range (0, 91).
131
132		Parameters
133		----------
134		char : str
135		The character to check
136
137		Returns
138		-------
139		bool
140		True if char is in the range (0, 91)
141
142		"""
143	1	return 91 > ord(char) > 0
144
145	1	ying = src.strip().upper()
146	1	yang = tar.strip().upper()
147
148	1	if ying == yang:
149	1	return 1.0
150		# If either string is blank - return - added in Version 2
151	1	if not ying or not yang:
152	1	return 0.0
153
154	1	adjwt = defaultdict(int)
155
156		# Initialize the adjwt array on the first call to the function only.
157		# The adjwt array is used to give partial credit for characters that
158		# may be errors due to known phonetic or character recognition errors.
159		# A typical example is to match the letter "O" with the number "0"
160	1	for i in self._sp_mx:
161	1	adjwt[(i[0], i[1])] = 3
162	1	adjwt[(i[1], i[0])] = 3
163
164	1	if len(ying) > len(yang):
165	1	search_range = len(ying)
166	1	minv = len(yang)
167		else:
168	1	search_range = len(yang)
169	1	minv = len(ying)
170
171		# Blank out the flags
172	1	ying_flag = [0] * search_range
173	1	yang_flag = [0] * search_range
174	1	search_range = max(0, search_range // 2 - 1)
175
176		# Looking only within the search range,
177		# count and flag the matched pairs.
178	1	num_com = 0
179	1	yl1 = len(yang) - 1
180	1	for i in range(len(ying)):
		0 ignored issues – show unused-code introduced 2018-08-02 19:04 UTC by Report Bug Copy Issue Report Consider using enumerate instead of iterating with range and len Loading history...
181	1	low_lim = (i - search_range) if (i >= search_range) else 0
182	1	hi_lim = (i + search_range) if ((i + search_range) <= yl1) else yl1
183	1	for j in range(low_lim, hi_lim + 1):
184	1	if (yang_flag[j] == 0) and (yang[j] == ying[i]):
185	1	yang_flag[j] = 1
186	1	ying_flag[i] = 1
187	1	num_com += 1
188	1	break
189
190		# If no characters in common - return
191	1	if num_com == 0:
192	1	return 0.0
193
194		# Count the number of transpositions
195	1	k = n_trans = 0
196	1	for i in range(len(ying)):
		0 ignored issues – show unused-code introduced 2018-08-02 19:04 UTC by Report Bug Copy Issue Report Consider using enumerate instead of iterating with range and len Loading history...
197	1	if ying_flag[i] != 0:
198	1	j = 0
199	1	for j in range(k, len(yang)): # pragma: no branch
200	1	if yang_flag[j] != 0:
201	1	k = j + 1
202	1	break
203	1	if ying[i] != yang[j]:
204	1	n_trans += 1
205	1	n_trans //= 2
206
207		# Adjust for similarities in unmatched characters
208	1	n_simi = 0
209	1	if minv > num_com:
		0 ignored issues – show unused-code introduced 2018-08-02 19:04 UTC by Report Bug Copy Issue Report Too many nested blocks (6/5) Loading history...
210	1	for i in range(len(ying)):
		0 ignored issues – show unused-code introduced 2018-08-02 19:04 UTC by Report Bug Copy Issue Report Consider using enumerate instead of iterating with range and len Loading history...
211	1	if ying_flag[i] == 0 and _in_range(ying[i]):
212	1	for j in range(len(yang)):
		0 ignored issues – show unused-code introduced 2018-08-02 19:04 UTC by Report Bug Copy Issue Report Consider using enumerate instead of iterating with range and len Loading history...
213	1	if yang_flag[j] == 0 and _in_range(yang[j]):
214	1	if (ying[i], yang[j]) in adjwt:
215	1	n_simi += adjwt[(ying[i], yang[j])]
216	1	yang_flag[j] = 2
217	1	break
218	1	num_sim = n_simi / 10.0 + num_com
219
220		# Main weight computation
221	1	weight = (
222		num_sim / len(ying)
223		+ num_sim / len(yang)
224		+ (num_com - n_trans) / num_com
225		)
226	1	weight /= 3.0
227
228		# Continue to boost the weight if the strings are similar
229	1	if weight > 0.7:
230
231		# Adjust for having up to the first 4 characters in common
232	1	j = 4 if (minv >= 4) else minv
233	1	i = 0
234	1	while (i < j) and (ying[i] == yang[i]) and (not ying[i].isdigit()):
235	1	i += 1
236	1	if i:
237	1	weight += i * 0.1 * (1.0 - weight)
238
239		# Optionally adjust for long strings.
240
241		# After agreeing beginning chars, at least two more must agree and
242		# the agreeing characters must be > .5 of remaining characters.
243	1	if (
244		long_strings
		0 ignored issues – show Coding Style introduced 2018-10-24 06:00 UTC by Report Bug Copy Issue Report Wrong hanging indentation before block (add 4 spaces). Loading history...
245		and (minv > 4)
		0 ignored issues – show Coding Style introduced 2018-10-24 06:00 UTC by Report Bug Copy Issue Report Wrong hanging indentation before block (add 4 spaces). Loading history...
246		and (num_com > i + 1)
		0 ignored issues – show Coding Style introduced 2018-10-24 06:00 UTC by Report Bug Copy Issue Report Wrong hanging indentation before block (add 4 spaces). Loading history...
247		and (2 * num_com >= minv + i)
		0 ignored issues – show Coding Style introduced 2018-10-24 06:00 UTC by Report Bug Copy Issue Report Wrong hanging indentation before block (add 4 spaces). Loading history...
248		):
249	1	if not ying[0].isdigit():
250	1	weight += (1.0 - weight) * (
251		(num_com - i - 1) / (len(ying) + len(yang) - i * 2 + 2)
252		)
253
254	1	return weight
255
256
257	1	def sim_strcmp95(src, tar, long_strings=False):
258		"""Return the strcmp95 similarity of two strings.
259
260		This is a wrapper for :py:meth:`Strcmp95.sim`.
261
262		Parameters
263		----------
264		src : str
265		Source string for comparison
266		tar : str
267		Target string for comparison
268		long_strings : bool
269		Set to True to increase the probability of a match when the number of
270		matched characters is large. This option allows for a little more
271		tolerance when the strings are large. It is not an appropriate test
272		when comparing fixed length fields such as phone and social security
273		numbers.
274
275		Returns
276		-------
277		float
278		Strcmp95 similarity
279
280		Examples
281		--------
282		>>> sim_strcmp95('cat', 'hat')
283		0.7777777777777777
284		>>> sim_strcmp95('Niall', 'Neil')
285		0.8454999999999999
286		>>> sim_strcmp95('aluminum', 'Catalan')
287		0.6547619047619048
288		>>> sim_strcmp95('ATCG', 'TAGC')
289		0.8333333333333334
290
291		"""
292	1	return Strcmp95().sim(src, tar, long_strings)
293
294
295	1	def dist_strcmp95(src, tar, long_strings=False):
296		"""Return the strcmp95 distance between two strings.
297
298		This is a wrapper for :py:meth:`Strcmp95.dist`.
299
300		Parameters
301		----------
302		src : str
303		Source string for comparison
304		tar : str
305		Target string for comparison
306		long_strings : bool
307		Set to True to increase the probability of a match when the number of
308		matched characters is large. This option allows for a little more
309		tolerance when the strings are large. It is not an appropriate test
310		when comparing fixed length fields such as phone and social security
311		numbers.
312
313		Returns
314		-------
315		float
316		Strcmp95 distance
317
318		Examples
319		--------
320		>>> round(dist_strcmp95('cat', 'hat'), 12)
321		0.222222222222
322		>>> round(dist_strcmp95('Niall', 'Neil'), 12)
323		0.1545
324		>>> round(dist_strcmp95('aluminum', 'Catalan'), 12)
325		0.345238095238
326		>>> round(dist_strcmp95('ATCG', 'TAGC'), 12)
327		0.166666666667
328
329		"""
330	1	return Strcmp95().dist(src, tar, long_strings)
331
332
333		if __name__ == '__main__':
334		import doctest
335
336		doctest.testmod()
337

chrislit / abydos

Push — master ( f43547...71985b )

abydos.distance._strcmp95 A

Complexity

Size/Duplication

Test Coverage

Importance

2 Functions

1 Method

Duplication Side-by-Side

Filter issues like