abydos.distance._jaro_winkler.dist_jaro_winkler() - Code Metrics - Inspection of "Merge pull request #248 from chrislit/0.6.0" - chrislit/abydos - Measure and Improve Code Quality continuously with Scrutinizer

Passed

Push — master ( c2a3b6...15a61d )

by Chris

created 2020-01-12 22:24 UTC

abydos.distance._jaro_winkler.dist_jaro_winkler() A

↳ Parent: Project

Complexity

Conditions

Size

Total Lines	78
Code Lines	16

Duplication

Lines	0
Ratio	0 %

Code Coverage

Tests	3
CRAP Score	1

Importance

Changes

Metric	Value
eloc	16
dl	0
loc	78
ccs	3
cts	3
cp	1
rs	9.6
c	0
b	0
f	0
cc	1
nop	7
crap	1

How to fix Long Method

# Copyright 2014-2020 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.

"""abydos.distance._jaro_winkler.

The distance._JaroWinkler module implements distance metrics based on
:cite:`Jaro:1989` and subsequent works:

    - Jaro distance
    - Jaro-Winkler distance
"""

from ._distance import _Distance
from ..tokenizer import QGrams

__all__ = ['JaroWinkler']


class JaroWinkler(_Distance):
    """Jaro-Winkler distance.

    Jaro(-Winkler) distance is a string edit distance initially proposed by
    Jaro and extended by Winkler :cite:`Jaro:1989,Winkler:1990`.

    This is Python based on the C code for strcmp95:
    http://web.archive.org/web/20110629121242/http://www.census.gov/geo/msb/stand/strcmp.c
    :cite:`Winkler:1994`. The above file is a US Government publication and,
    accordingly, in the public domain.

    .. versionadded:: 0.3.6
    """

    def __init__(
        self,
        qval=1,
        mode='winkler',
        long_strings=False,
        boost_threshold=0.7,
        scaling_factor=0.1,
        **kwargs
    ):
        """Initialize JaroWinkler instance.

        Parameters
        ----------
        qval : int
            The length of each q-gram (defaults to 1: character-wise matching)
        mode : str
            Indicates which variant of this distance metric to compute:

                - ``winkler`` -- computes the Jaro-Winkler distance (default)
                  which increases the score for matches near the start of the
                  word
                - ``jaro`` -- computes the Jaro distance

        long_strings : bool
            Set to True to "Increase the probability of a match when the number
            of matched characters is large. This option allows for a little
            more tolerance when the strings are large. It is not an appropriate
            test when comparing fixed length fields such as phone and social
            security numbers." (Used in 'winkler' mode only.)
        boost_threshold : float
            A value between 0 and 1, below which the Winkler boost is not
            applied (defaults to 0.7). (Used in 'winkler' mode only.)
        scaling_factor : float
            A value between 0 and 0.25, indicating by how much to boost scores
            for matching prefixes (defaults to 0.1). (Used in 'winkler' mode
            only.)


        .. versionadded:: 0.4.0

        """
        super(JaroWinkler, self).__init__(**kwargs)
        self._qval = qval
        self._mode = mode
        self._long_strings = long_strings
        self._boost_threshold = boost_threshold
        self._scaling_factor = scaling_factor

    def sim(self, src, tar):
        """Return the Jaro or Jaro-Winkler similarity of two strings.

        Parameters
        ----------
        src : str
            Source string for comparison
        tar : str
            Target string for comparison

        Returns
        -------
        float
            Jaro or Jaro-Winkler similarity

        Raises
        ------
        ValueError
            Unsupported boost_threshold assignment; boost_threshold must be
            between 0 and 1.
        ValueError
            Unsupported scaling_factor assignment; scaling_factor must be
            between 0 and 0.25.'

        Examples
        --------
        >>> cmp = JaroWinkler()
        >>> round(cmp.sim('cat', 'hat'), 12)
        0.777777777778
        >>> round(cmp.sim('Niall', 'Neil'), 12)
        0.805
        >>> round(cmp.sim('aluminum', 'Catalan'), 12)
        0.60119047619
        >>> round(cmp.sim('ATCG', 'TAGC'), 12)
        0.833333333333

        >>> cmp = JaroWinkler(mode='jaro')
        >>> round(cmp.sim('cat', 'hat'), 12)
        0.777777777778
        >>> round(cmp.sim('Niall', 'Neil'), 12)
        0.783333333333
        >>> round(cmp.sim('aluminum', 'Catalan'), 12)
        0.60119047619
        >>> round(cmp.sim('ATCG', 'TAGC'), 12)
        0.833333333333


        .. versionadded:: 0.1.0
        .. versionchanged:: 0.3.6
            Encapsulated in class

        """
        if self._mode == 'winkler':
            if self._boost_threshold > 1 or self._boost_threshold < 0:
                raise ValueError(
                    'Unsupported boost_threshold assignment; '
                    + 'boost_threshold must be between 0 and 1.'
                )
            if self._scaling_factor > 0.25 or self._scaling_factor < 0:
                raise ValueError(
                    'Unsupported scaling_factor assignment; '
                    + 'scaling_factor must be between 0 and 0.25.'
                )

        if src == tar:
            return 1.0

        src = QGrams(self._qval).tokenize(src.strip()).get_list()
        tar = QGrams(self._qval).tokenize(tar.strip()).get_list()

        lens = len(src)
        lent = len(tar)

        # If either string is blank - return - added in Version 2
        if lens == 0 or lent == 0:
            return 0.0

        if lens > lent:
            search_range = lens
            minv = lent
        else:
            search_range = lent
            minv = lens

        # Zero out the flags
        src_flag = [0] * search_range
        tar_flag = [0] * search_range
        search_range = max(0, search_range // 2 - 1)

        # Looking only within the search range,
        # count and flag the matched pairs.
        num_com = 0
        yl1 = lent - 1
        for i in range(lens):
            low_lim = (i - search_range) if (i >= search_range) else 0
            hi_lim = (i + search_range) if ((i + search_range) <= yl1) else yl1
            for j in range(low_lim, hi_lim + 1):
                if (tar_flag[j] == 0) and (tar[j] == src[i]):
                    tar_flag[j] = 1
                    src_flag[i] = 1
                    num_com += 1
                    break

        # If no characters in common - return
        if num_com == 0:
            return 0.0

        # Count the number of transpositions
        k = n_trans = 0
        for i in range(lens):
            if src_flag[i] != 0:
                j = 0
                for j in range(k, lent):  # pragma: no branch
                    if tar_flag[j] != 0:
                        k = j + 1
                        break
                if src[i] != tar[j]:
                    n_trans += 1
        n_trans //= 2

        # Main weight computation for Jaro distance
        weight = (
            num_com / lens + num_com / lent + (num_com - n_trans) / num_com
        )
        weight /= 3.0

        # Continue to boost the weight if the strings are similar
        # This is the Winkler portion of Jaro-Winkler distance
        if self._mode == 'winkler' and weight > self._boost_threshold:

            # Adjust for having up to the first 4 characters in common
            j = 4 if (minv >= 4) else minv
            i = 0
            while (i < j) and (src[i] == tar[i]):
                i += 1
            weight += i * self._scaling_factor * (1.0 - weight)

            # Optionally adjust for long strings.

            # After agreeing beginning chars, at least two more must agree and
            # the agreeing characters must be > .5 of remaining characters.
            if (
                self._long_strings
                and (minv > 4)
                and (num_com > i + 1)
                and (2 * num_com >= minv + i)
            ):
                weight += (1.0 - weight) * (
                    (num_com - i - 1) / (lens + lent - i * 2 + 2)
                )

        return weight


if __name__ == '__main__':
    import doctest

    doctest.testmod()


1		# Copyright 2014-2020 by Christopher C. Little.
2		# This file is part of Abydos.
3		#
4		# Abydos is free software: you can redistribute it and/or modify
5		# it under the terms of the GNU General Public License as published by
6		# the Free Software Foundation, either version 3 of the License, or
7		# (at your option) any later version.
8		#
9		# Abydos is distributed in the hope that it will be useful,
10		# but WITHOUT ANY WARRANTY; without even the implied warranty of
11		# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12		# GNU General Public License for more details.
13		#
14		# You should have received a copy of the GNU General Public License
15		# along with Abydos. If not, see <http://www.gnu.org/licenses/>.
16
17		"""abydos.distance._jaro_winkler.
18
19	1	The distance._JaroWinkler module implements distance metrics based on
20		:cite:`Jaro:1989` and subsequent works:
21
22		- Jaro distance
23		- Jaro-Winkler distance
24		"""
25
26		from ._distance import _Distance
27		from ..tokenizer import QGrams
28	1
29		__all__ = ['JaroWinkler']
30
31
32		class JaroWinkler(_Distance):
33		"""Jaro-Winkler distance.
34
35	1	Jaro(-Winkler) distance is a string edit distance initially proposed by
36		Jaro and extended by Winkler :cite:`Jaro:1989,Winkler:1990`.
37	1
38		This is Python based on the C code for strcmp95:
39	1	http://web.archive.org/web/20110629121242/http://www.census.gov/geo/msb/stand/strcmp.c
40	1	:cite:`Winkler:1994`. The above file is a US Government publication and,
41	1	accordingly, in the public domain.
42
43	1	.. versionadded:: 0.3.6
44		"""
45
46	1	def __init__(
47		self,
48		qval=1,
49		mode='winkler',
50		long_strings=False,
51		boost_threshold=0.7,
52		scaling_factor=0.1,
53		**kwargs
54		):
55		"""Initialize JaroWinkler instance.
56
57		Parameters
58		----------
59		qval : int
60	1	The length of each q-gram (defaults to 1: character-wise matching)
61		mode : str
62		Indicates which variant of this distance metric to compute:
63
64		- ``winkler`` -- computes the Jaro-Winkler distance (default)
65		which increases the score for matches near the start of the
66		word
67		- ``jaro`` -- computes the Jaro distance
68
69		long_strings : bool
70		Set to True to "Increase the probability of a match when the number
71		of matched characters is large. This option allows for a little
72		more tolerance when the strings are large. It is not an appropriate
73		test when comparing fixed length fields such as phone and social
74		security numbers." (Used in 'winkler' mode only.)
75		boost_threshold : float
76		A value between 0 and 1, below which the Winkler boost is not
77		applied (defaults to 0.7). (Used in 'winkler' mode only.)
78		scaling_factor : float
79		A value between 0 and 0.25, indicating by how much to boost scores
80		for matching prefixes (defaults to 0.1). (Used in 'winkler' mode
81		only.)
82
83
84		.. versionadded:: 0.4.0
85
86		"""
87		super(JaroWinkler, self).__init__(**kwargs)
88		self._qval = qval
89		self._mode = mode
90		self._long_strings = long_strings
91		self._boost_threshold = boost_threshold
92		self._scaling_factor = scaling_factor
93
94		def sim(self, src, tar):
95		"""Return the Jaro or Jaro-Winkler similarity of two strings.
96
97		Parameters
98		----------
99		src : str
100		Source string for comparison
101	1	tar : str
102	1	Target string for comparison
103	1
104	1	Returns
105	1	-------
106	1	float
107		Jaro or Jaro-Winkler similarity
108	1
109		Raises
110		------
111		ValueError
112		Unsupported boost_threshold assignment; boost_threshold must be
113		between 0 and 1.
114		ValueError
115		Unsupported scaling_factor assignment; scaling_factor must be
116		between 0 and 0.25.'
117
118		Examples
119		--------
120		>>> cmp = JaroWinkler()
121		>>> round(cmp.sim('cat', 'hat'), 12)
122		0.777777777778
123		>>> round(cmp.sim('Niall', 'Neil'), 12)
124		0.805
125		>>> round(cmp.sim('aluminum', 'Catalan'), 12)
126		0.60119047619
127		>>> round(cmp.sim('ATCG', 'TAGC'), 12)
128		0.833333333333
129
130		>>> cmp = JaroWinkler(mode='jaro')
131		>>> round(cmp.sim('cat', 'hat'), 12)
132		0.777777777778
133		>>> round(cmp.sim('Niall', 'Neil'), 12)
134		0.783333333333
135		>>> round(cmp.sim('aluminum', 'Catalan'), 12)
136		0.60119047619
137		>>> round(cmp.sim('ATCG', 'TAGC'), 12)
138		0.833333333333
139
140
141		.. versionadded:: 0.1.0
142		.. versionchanged:: 0.3.6
143		Encapsulated in class
144
145		"""
146		if self._mode == 'winkler':
147		if self._boost_threshold > 1 or self._boost_threshold < 0:
148		raise ValueError(
149		'Unsupported boost_threshold assignment; '
150		+ 'boost_threshold must be between 0 and 1.'
151		)
152		if self._scaling_factor > 0.25 or self._scaling_factor < 0:
153		raise ValueError(
154		'Unsupported scaling_factor assignment; '
155		+ 'scaling_factor must be between 0 and 0.25.'
156		)
157
158	1	if src == tar:
159	1	return 1.0
160	1
161		src = QGrams(self._qval).tokenize(src.strip()).get_list()
162		tar = QGrams(self._qval).tokenize(tar.strip()).get_list()
163
164	1	lens = len(src)
165	1	lent = len(tar)
166
167		# If either string is blank - return - added in Version 2
168		if lens == 0 or lent == 0:
169		return 0.0
170	1
171	1	if lens > lent:
172		search_range = lens
173	1	minv = lent
174	1	else:
175		search_range = lent
176	1	minv = lens
177	1
178		# Zero out the flags
179		src_flag = [0] * search_range
180	1	tar_flag = [0] * search_range
181	1	search_range = max(0, search_range // 2 - 1)
182
183	1	# Looking only within the search range,
184	1	# count and flag the matched pairs.
185	1	num_com = 0
186		yl1 = lent - 1
187	1	for i in range(lens):
188	1	low_lim = (i - search_range) if (i >= search_range) else 0
189		hi_lim = (i + search_range) if ((i + search_range) <= yl1) else yl1
190		for j in range(low_lim, hi_lim + 1):
191	1	if (tar_flag[j] == 0) and (tar[j] == src[i]):
192	1	tar_flag[j] = 1
193	1	src_flag[i] = 1
194		num_com += 1
195		break
196
197	1	# If no characters in common - return
198	1	if num_com == 0:
199	1	return 0.0
200	1
201	1	# Count the number of transpositions
202	1	k = n_trans = 0
203	1	for i in range(lens):
204	1	if src_flag[i] != 0:
205	1	j = 0
206	1	for j in range(k, lent): # pragma: no branch
207	1	if tar_flag[j] != 0:
208		k = j + 1
209		break
210	1	if src[i] != tar[j]:
211	1	n_trans += 1
212		n_trans //= 2
213
214	1	# Main weight computation for Jaro distance
215	1	weight = (
216	1	num_com / lens + num_com / lent + (num_com - n_trans) / num_com
217	1	)
218	1	weight /= 3.0
219	1
220	1	# Continue to boost the weight if the strings are similar
221	1	# This is the Winkler portion of Jaro-Winkler distance
222	1	if self._mode == 'winkler' and weight > self._boost_threshold:
223	1
224	1	# Adjust for having up to the first 4 characters in common
225		j = 4 if (minv >= 4) else minv
226		i = 0
227	1	while (i < j) and (src[i] == tar[i]):
228		i += 1
229		weight += i * self._scaling_factor * (1.0 - weight)
230	1
231		# Optionally adjust for long strings.
232
233		# After agreeing beginning chars, at least two more must agree and
234	1	# the agreeing characters must be > .5 of remaining characters.
235		if (
236		self._long_strings
237	1	and (minv > 4)
238	1	and (num_com > i + 1)
239	1	and (2 * num_com >= minv + i)
240	1	):
241	1	weight += (1.0 - weight) * (
242		(num_com - i - 1) / (lens + lent - i * 2 + 2)
243		)
244
245		return weight
246
247	1
248		if __name__ == '__main__':
249		import doctest
250
251		doctest.testmod()
252

chrislit / abydos

Push — master ( c2a3b6...15a61d )

abydos.distance._jaro_winkler.dist_jaro_winkler() A

Complexity

Size

Duplication

Code Coverage

Importance

How to fix Long Method

Long Method

Duplication Side-by-Side

Filter issues like