abydos.distance._strcmp95 - Code Metrics - chrislit/abydos - Measure and Improve Code Quality continuously with Scrutinizer

abydos.distance._strcmp95 A
last analyzed 2020-12-31 20:10 UTC

↳ Parent: Project

Complexity

Total Complexity

Size/Duplication

Total Lines	275
Duplicated Lines	0 %

Test Coverage

Coverage

100%

Importance

Changes

Metric	Value
wmc	38
eloc	126
dl	0
loc	275
ccs	79
cts	79
cp	1
rs	9.36
c	0
b	0
f	0

2 Methods

Rating	Name	Duplication	Size	Complexity
A	Strcmp95.__init__()	0	20	1
F	Strcmp95.sim()	0	162	37

# Copyright 2014-2020 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.

"""abydos.distance._strcmp95.

The strcmp95 algorithm variant of Jaro-Winkler distance
"""

from collections import defaultdict
from typing import Any, DefaultDict, Tuple

from ._distance import _Distance

__all__ = ['Strcmp95']


class Strcmp95(_Distance):
    """Strcmp95.

    This is a Python translation of the C code for strcmp95:
    http://web.archive.org/web/20110629121242/http://www.census.gov/geo/msb/stand/strcmp.c
    :cite:`Winkler:1994`.
    The above file is a US Government publication and, accordingly,
    in the public domain.

    This is based on the Jaro-Winkler distance, but also attempts to correct
    for some common typos and frequently confused characters. It is also
    limited to uppercase ASCII characters, so it is appropriate to American
    names, but not much else.

    .. versionadded:: 0.3.6
    """

    _sp_mx = (
        ('A', 'E'),
        ('A', 'I'),
        ('A', 'O'),
        ('A', 'U'),
        ('B', 'V'),
        ('E', 'I'),
        ('E', 'O'),
        ('E', 'U'),
        ('I', 'O'),
        ('I', 'U'),
        ('O', 'U'),
        ('I', 'Y'),
        ('E', 'Y'),
        ('C', 'G'),
        ('E', 'F'),
        ('W', 'U'),
        ('W', 'V'),
        ('X', 'K'),
        ('S', 'Z'),
        ('X', 'S'),
        ('Q', 'C'),
        ('U', 'V'),
        ('M', 'N'),
        ('L', 'I'),
        ('Q', 'O'),
        ('P', 'R'),
        ('I', 'J'),
        ('2', 'Z'),
        ('5', 'S'),
        ('8', 'B'),
        ('1', 'I'),
        ('1', 'L'),
        ('0', 'O'),
        ('0', 'Q'),
        ('C', 'K'),
        ('G', 'J'),
    )

    def __init__(self, long_strings: bool = False, **kwargs: Any) -> None:
        """Initialize Strcmp95 instance.

        Parameters
        ----------
        long_strings : bool
            Set to True to increase the probability of a match when the number
            of matched characters is large. This option allows for a little
            more tolerance when the strings are large. It is not an appropriate
            test when comparing fixed length fields such as phone and social
            security numbers.
        **kwargs
            Arbitrary keyword arguments


        .. versionadded:: 0.4.0

        """
        super(Strcmp95, self).__init__(**kwargs)
        self._long_strings = long_strings

    def sim(self, src: str, tar: str) -> float:
        """Return the strcmp95 similarity of two strings.

        Parameters
        ----------
        src : str
            Source string for comparison
        tar : str
            Target string for comparison

        Returns
        -------
        float
            Strcmp95 similarity

        Examples
        --------
        >>> cmp = Strcmp95()
        >>> cmp.sim('cat', 'hat')
        0.7777777777777777
        >>> cmp.sim('Niall', 'Neil')
        0.8454999999999999
        >>> cmp.sim('aluminum', 'Catalan')
        0.6547619047619048
        >>> cmp.sim('ATCG', 'TAGC')
        0.8333333333333334


        .. versionadded:: 0.1.0
        .. versionchanged:: 0.3.6
            Encapsulated in class

        """

        def _in_range(char: str) -> bool:
            """Return True if char is in the range (0, 91).

            Parameters
            ----------
            char : str
                The character to check

            Returns
            -------
            bool
                True if char is in the range (0, 91)

            .. versionadded:: 0.1.0

            """
            return 91 > ord(char) > 0

        ying = src.strip().upper()
        yang = tar.strip().upper()

        if ying == yang:
            return 1.0
        # If either string is blank - return - added in Version 2
        if not ying or not yang:
            return 0.0

        adjwt = defaultdict(int)  # type: DefaultDict[Tuple[str, str], int]

        # Initialize the adjwt array on the first call to the function only.
        # The adjwt array is used to give partial credit for characters that
        # may be errors due to known phonetic or character recognition errors.
        # A typical example is to match the letter "O" with the number "0"
        for tup in self._sp_mx:
            adjwt[(tup[0], tup[1])] = 3
            adjwt[(tup[1], tup[0])] = 3

        if len(ying) > len(yang):
            search_range = len(ying)
            minv = len(yang)
        else:
            search_range = len(yang)
            minv = len(ying)

        # Blank out the flags
        ying_flag = [0] * search_range
        yang_flag = [0] * search_range
        search_range = max(0, search_range // 2 - 1)

        # Looking only within the search range,
        # count and flag the matched pairs.
        num_com = 0
        yl1 = len(yang) - 1
        for i in range(len(ying)):
            low_lim = (i - search_range) if (i >= search_range) else 0
            hi_lim = (i + search_range) if ((i + search_range) <= yl1) else yl1
            for j in range(low_lim, hi_lim + 1):
                if (yang_flag[j] == 0) and (yang[j] == ying[i]):
                    yang_flag[j] = 1
                    ying_flag[i] = 1
                    num_com += 1
                    break

        # If no characters in common - return
        if num_com == 0:
            return 0.0

        # Count the number of transpositions
        k = n_trans = 0
        for i in range(len(ying)):
            if ying_flag[i] != 0:
                j = 0
                for j in range(k, len(yang)):  # pragma: no branch
                    if yang_flag[j] != 0:
                        k = j + 1
                        break
                if ying[i] != yang[j]:
                    n_trans += 1
        n_trans //= 2

        # Adjust for similarities in unmatched characters
        n_simi = 0
        if minv > num_com:
            for i in range(len(ying)):
                if ying_flag[i] == 0 and _in_range(ying[i]):
                    for j in range(len(yang)):
                        if yang_flag[j] == 0 and _in_range(yang[j]):
                            if (ying[i], yang[j]) in adjwt:
                                n_simi += adjwt[(ying[i], yang[j])]
                                yang_flag[j] = 2
                                break
        num_sim = n_simi / 10.0 + num_com

        # Main weight computation
        weight = (
            num_sim / len(ying)
            + num_sim / len(yang)
            + (num_com - n_trans) / num_com
        )
        weight /= 3.0

        # Continue to boost the weight if the strings are similar
        if weight > 0.7:

            # Adjust for having up to the first 4 characters in common
            j = 4 if (minv >= 4) else minv
            i = 0
            while (i < j) and (ying[i] == yang[i]) and (not ying[i].isdigit()):
                i += 1
            if i:
                weight += i * 0.1 * (1.0 - weight)

            # Optionally adjust for long strings.

            # After agreeing beginning chars, at least two more must agree and
            # the agreeing characters must be > .5 of remaining characters.
            if (
                self._long_strings
                and (minv > 4)
                and (num_com > i + 1)
                and (2 * num_com >= minv + i)
            ):
                if not ying[0].isdigit():
                    weight += (1.0 - weight) * (
                        (num_com - i - 1) / (len(ying) + len(yang) - i * 2 + 2)
                    )

        return weight


if __name__ == '__main__':
    import doctest

    doctest.testmod()


1		# Copyright 2014-2020 by Christopher C. Little.
2		# This file is part of Abydos.
3		#
4		# Abydos is free software: you can redistribute it and/or modify
5		# it under the terms of the GNU General Public License as published by
6		# the Free Software Foundation, either version 3 of the License, or
7		# (at your option) any later version.
8		#
9		# Abydos is distributed in the hope that it will be useful,
10		# but WITHOUT ANY WARRANTY; without even the implied warranty of
11		# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12		# GNU General Public License for more details.
13		#
14		# You should have received a copy of the GNU General Public License
15		# along with Abydos. If not, see <http://www.gnu.org/licenses/>.
16
17		"""abydos.distance._strcmp95.
18
19	1	The strcmp95 algorithm variant of Jaro-Winkler distance
20		"""
21
22		from collections import defaultdict
23		from typing import Any, DefaultDict, Tuple
24	1
25		from ._distance import _Distance
26
27		__all__ = ['Strcmp95']
28
29
30		class Strcmp95(_Distance):
31	1	"""Strcmp95.
32
33	1	This is a Python translation of the C code for strcmp95:
34		http://web.archive.org/web/20110629121242/http://www.census.gov/geo/msb/stand/strcmp.c
35	1	:cite:`Winkler:1994`.
36		The above file is a US Government publication and, accordingly,
37	1	in the public domain.
38	1
39		This is based on the Jaro-Winkler distance, but also attempts to correct
40	1	for some common typos and frequently confused characters. It is also
41		limited to uppercase ASCII characters, so it is appropriate to American
42		names, but not much else.
43	1
44		.. versionadded:: 0.3.6
45		"""
46
47		_sp_mx = (
48		('A', 'E'),
49		('A', 'I'),
50		('A', 'O'),
51		('A', 'U'),
52		('B', 'V'),
53		('E', 'I'),
54		('E', 'O'),
55		('E', 'U'),
56		('I', 'O'),
57		('I', 'U'),
58		('O', 'U'),
59		('I', 'Y'),
60	1	('E', 'Y'),
61		('C', 'G'),
62		('E', 'F'),
63		('W', 'U'),
64		('W', 'V'),
65		('X', 'K'),
66		('S', 'Z'),
67		('X', 'S'),
68		('Q', 'C'),
69		('U', 'V'),
70		('M', 'N'),
71		('L', 'I'),
72		('Q', 'O'),
73		('P', 'R'),
74		('I', 'J'),
75		('2', 'Z'),
76		('5', 'S'),
77		('8', 'B'),
78		('1', 'I'),
79		('1', 'L'),
80		('0', 'O'),
81		('0', 'Q'),
82		('C', 'K'),
83		('G', 'J'),
84		)
85
86		def __init__(self, long_strings: bool = False, **kwargs: Any) -> None:
87		"""Initialize Strcmp95 instance.
88
89		Parameters
90		----------
91		long_strings : bool
92		Set to True to increase the probability of a match when the number
93		of matched characters is large. This option allows for a little
94		more tolerance when the strings are large. It is not an appropriate
95		test when comparing fixed length fields such as phone and social
96		security numbers.
97		**kwargs
98		Arbitrary keyword arguments
99	1
100
101		.. versionadded:: 0.4.0
102
103		"""
104		super(Strcmp95, self).__init__(**kwargs)
105		self._long_strings = long_strings
106
107		def sim(self, src: str, tar: str) -> float:
108		"""Return the strcmp95 similarity of two strings.
109
110		Parameters
111		----------
112		src : str
113		Source string for comparison
114		tar : str
115		Target string for comparison
116
117	1	Returns
118	1	-------
119		float
120	1	Strcmp95 similarity
121
122		Examples
123		--------
124		>>> cmp = Strcmp95()
125		>>> cmp.sim('cat', 'hat')
126		0.7777777777777777
127		>>> cmp.sim('Niall', 'Neil')
128		0.8454999999999999
129		>>> cmp.sim('aluminum', 'Catalan')
130		0.6547619047619048
131		>>> cmp.sim('ATCG', 'TAGC')
132		0.8333333333333334
133
134
135		.. versionadded:: 0.1.0
136		.. versionchanged:: 0.3.6
137		Encapsulated in class
138
139		"""
140
141		def _in_range(char: str) -> bool:
142		"""Return True if char is in the range (0, 91).
143
144		Parameters
145		----------
146		char : str
147		The character to check
148
149		Returns
150		-------
151		bool
152		True if char is in the range (0, 91)
153
154	1	.. versionadded:: 0.1.0
155
156		"""
157		return 91 > ord(char) > 0
158
159		ying = src.strip().upper()
160		yang = tar.strip().upper()
161
162		if ying == yang:
163		return 1.0
164		# If either string is blank - return - added in Version 2
165		if not ying or not yang:
166		return 0.0
167
168		adjwt = defaultdict(int) # type: DefaultDict[Tuple[str, str], int]
169
170	1	# Initialize the adjwt array on the first call to the function only.
171		# The adjwt array is used to give partial credit for characters that
172	1	# may be errors due to known phonetic or character recognition errors.
173	1	# A typical example is to match the letter "O" with the number "0"
174		for tup in self._sp_mx:
175	1	adjwt[(tup[0], tup[1])] = 3
176	1	adjwt[(tup[1], tup[0])] = 3
177
178	1	if len(ying) > len(yang):
179	1	search_range = len(ying)
180		minv = len(yang)
181	1	else:
182		search_range = len(yang)
183		minv = len(ying)
184
185		# Blank out the flags
186		ying_flag = [0] * search_range
187	1	yang_flag = [0] * search_range
188	1	search_range = max(0, search_range // 2 - 1)
189	1
190		# Looking only within the search range,
191	1	# count and flag the matched pairs.
192	1	num_com = 0
193	1	yl1 = len(yang) - 1
194		for i in range(len(ying)):
195	1	low_lim = (i - search_range) if (i >= search_range) else 0
196	1	hi_lim = (i + search_range) if ((i + search_range) <= yl1) else yl1
197		for j in range(low_lim, hi_lim + 1):
198		if (yang_flag[j] == 0) and (yang[j] == ying[i]):
199	1	yang_flag[j] = 1
200	1	ying_flag[i] = 1
201	1	num_com += 1
202		break
203
204		# If no characters in common - return
205	1	if num_com == 0:
206	1	return 0.0
207	1
208	1	# Count the number of transpositions
209	1	k = n_trans = 0
210	1	for i in range(len(ying)):
211	1	if ying_flag[i] != 0:
212	1	j = 0
213	1	for j in range(k, len(yang)): # pragma: no branch
214	1	if yang_flag[j] != 0:
215	1	k = j + 1
216		break
217		if ying[i] != yang[j]:
218	1	n_trans += 1
219	1	n_trans //= 2
220
221		# Adjust for similarities in unmatched characters
222	1	n_simi = 0
223	1	if minv > num_com:
224	1	for i in range(len(ying)):
225	1	if ying_flag[i] == 0 and _in_range(ying[i]):
226	1	for j in range(len(yang)):
227	1	if yang_flag[j] == 0 and _in_range(yang[j]):
228	1	if (ying[i], yang[j]) in adjwt:
229	1	n_simi += adjwt[(ying[i], yang[j])]
230	1	yang_flag[j] = 2
231	1	break
232	1	num_sim = n_simi / 10.0 + num_com
233
234		# Main weight computation
235	1	weight = (
236	1	num_sim / len(ying)
237	1	+ num_sim / len(yang)
238	1	+ (num_com - n_trans) / num_com
239	1	)
240	1	weight /= 3.0
241	1
242	1	# Continue to boost the weight if the strings are similar
243	1	if weight > 0.7:
244	1
245	1	# Adjust for having up to the first 4 characters in common
246		j = 4 if (minv >= 4) else minv
247		i = 0
248	1	while (i < j) and (ying[i] == yang[i]) and (not ying[i].isdigit()):
249		i += 1
250		if i:
251		weight += i * 0.1 * (1.0 - weight)
252
253	1	# Optionally adjust for long strings.
254
255		# After agreeing beginning chars, at least two more must agree and
256	1	# the agreeing characters must be > .5 of remaining characters.
257		if (
258		self._long_strings
259	1	and (minv > 4)
260	1	and (num_com > i + 1)
261	1	and (2 * num_com >= minv + i)
262	1	):
263	1	if not ying[0].isdigit():
264	1	weight += (1.0 - weight) * (
265		(num_com - i - 1) / (len(ying) + len(yang) - i * 2 + 2)
266		)
267
268		return weight
269
270	1
271		if __name__ == '__main__':
272		import doctest
273
274		doctest.testmod()
275

chrislit / abydos

abydos.distance._strcmp95 A last analyzed 2020-12-31 20:10 UTC

Complexity

Size/Duplication

Test Coverage

Importance

2 Methods

Duplication Side-by-Side

Filter issues like

abydos.distance._strcmp95 A
last analyzed 2020-12-31 20:10 UTC