abydos.distance._jaro_winkler - Code Metrics - chrislit/abydos - Measure and Improve Code Quality continuously with Scrutinizer

abydos.distance._jaro_winkler A
last analyzed 2020-12-31 20:10 UTC

↳ Parent: Project

Complexity

Total Complexity

Size/Duplication

Total Lines	257
Duplicated Lines	0 %

Test Coverage

Coverage

100%

Importance

Changes

Metric	Value
wmc	32
eloc	93
dl	0
loc	257
ccs	72
cts	72
cp	1
rs	9.84
c	0
b	0
f	0

2 Methods

Rating	Name	Duplication	Size	Complexity
A	JaroWinkler.__init__()	0	47	1
F	JaroWinkler.sim()	0	155	31

# Copyright 2014-2020 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.

"""abydos.distance._jaro_winkler.

The distance._JaroWinkler module implements distance metrics based on
:cite:`Jaro:1989` and subsequent works:

    - Jaro distance
    - Jaro-Winkler distance
"""

from typing import Any

from ._distance import _Distance
from ..tokenizer import QGrams

__all__ = ['JaroWinkler']


class JaroWinkler(_Distance):
    """Jaro-Winkler distance.

    Jaro(-Winkler) distance is a string edit distance initially proposed by
    Jaro and extended by Winkler :cite:`Jaro:1989,Winkler:1990`.

    This is Python based on the C code for strcmp95:
    http://web.archive.org/web/20110629121242/http://www.census.gov/geo/msb/stand/strcmp.c
    :cite:`Winkler:1994`. The above file is a US Government publication and,
    accordingly, in the public domain.

    .. versionadded:: 0.3.6
    """

    def __init__(
        self,
        qval: int = 1,
        mode: str = 'winkler',
        long_strings: bool = False,
        boost_threshold: float = 0.7,
        scaling_factor: float = 0.1,
        **kwargs: Any
    ) -> None:
        """Initialize JaroWinkler instance.

        Parameters
        ----------
        qval : int
            The length of each q-gram (defaults to 1: character-wise matching)
        mode : str
            Indicates which variant of this distance metric to compute:

                - ``winkler`` -- computes the Jaro-Winkler distance (default)
                  which increases the score for matches near the start of the
                  word
                - ``jaro`` -- computes the Jaro distance

        long_strings : bool
            Set to True to "Increase the probability of a match when the number
            of matched characters is large. This option allows for a little
            more tolerance when the strings are large. It is not an appropriate
            test when comparing fixed length fields such as phone and social
            security numbers." (Used in 'winkler' mode only.)
        boost_threshold : float
            A value between 0 and 1, below which the Winkler boost is not
            applied (defaults to 0.7). (Used in 'winkler' mode only.)
        scaling_factor : float
            A value between 0 and 0.25, indicating by how much to boost scores
            for matching prefixes (defaults to 0.1). (Used in 'winkler' mode
            only.)


        .. versionadded:: 0.4.0

        """
        super(JaroWinkler, self).__init__(**kwargs)
        self._qval = qval
        self._mode = mode
        self._long_strings = long_strings
        self._boost_threshold = boost_threshold
        self._scaling_factor = scaling_factor

    def sim(self, src: str, tar: str) -> float:
        """Return the Jaro or Jaro-Winkler similarity of two strings.

        Parameters
        ----------
        src : str
            Source string for comparison
        tar : str
            Target string for comparison

        Returns
        -------
        float
            Jaro or Jaro-Winkler similarity

        Raises
        ------
        ValueError
            Unsupported boost_threshold assignment; boost_threshold must be
            between 0 and 1.
        ValueError
            Unsupported scaling_factor assignment; scaling_factor must be
            between 0 and 0.25.'

        Examples
        --------
        >>> cmp = JaroWinkler()
        >>> round(cmp.sim('cat', 'hat'), 12)
        0.777777777778
        >>> round(cmp.sim('Niall', 'Neil'), 12)
        0.805
        >>> round(cmp.sim('aluminum', 'Catalan'), 12)
        0.60119047619
        >>> round(cmp.sim('ATCG', 'TAGC'), 12)
        0.833333333333

        >>> cmp = JaroWinkler(mode='jaro')
        >>> round(cmp.sim('cat', 'hat'), 12)
        0.777777777778
        >>> round(cmp.sim('Niall', 'Neil'), 12)
        0.783333333333
        >>> round(cmp.sim('aluminum', 'Catalan'), 12)
        0.60119047619
        >>> round(cmp.sim('ATCG', 'TAGC'), 12)
        0.833333333333


        .. versionadded:: 0.1.0
        .. versionchanged:: 0.3.6
            Encapsulated in class

        """
        if self._mode == 'winkler':
            if self._boost_threshold > 1 or self._boost_threshold < 0:
                raise ValueError(
                    'Unsupported boost_threshold assignment; '
                    + 'boost_threshold must be between 0 and 1.'
                )
            if self._scaling_factor > 0.25 or self._scaling_factor < 0:
                raise ValueError(
                    'Unsupported scaling_factor assignment; '
                    + 'scaling_factor must be between 0 and 0.25.'
                )

        if src == tar:
            return 1.0

        tokenizer = QGrams(self._qval)
        tokenizer.tokenize(src.strip())
        src_list = tokenizer.get_list()
        tokenizer.tokenize(tar.strip())
        tar_list = tokenizer.get_list()

        lens = len(src_list)
        lent = len(tar_list)

        # If either string is blank - return - added in Version 2
        if lens == 0 or lent == 0:
            return 0.0

        if lens > lent:
            search_range = lens
            minv = lent
        else:
            search_range = lent
            minv = lens

        # Zero out the flags
        src_flag = [0] * search_range
        tar_flag = [0] * search_range
        search_range = max(0, search_range // 2 - 1)

        # Looking only within the search range,
        # count and flag the matched pairs.
        num_com = 0
        yl1 = lent - 1
        for i in range(lens):
            low_lim = (i - search_range) if (i >= search_range) else 0
            hi_lim = (i + search_range) if ((i + search_range) <= yl1) else yl1
            for j in range(low_lim, hi_lim + 1):
                if (tar_flag[j] == 0) and (tar_list[j] == src_list[i]):
                    tar_flag[j] = 1
                    src_flag[i] = 1
                    num_com += 1
                    break

        # If no characters in common - return
        if num_com == 0:
            return 0.0

        # Count the number of transpositions
        k = n_trans = 0
        for i in range(lens):
            if src_flag[i] != 0:
                j = 0
                for j in range(k, lent):  # pragma: no branch
                    if tar_flag[j] != 0:
                        k = j + 1
                        break
                if src_list[i] != tar_list[j]:
                    n_trans += 1
        n_trans //= 2

        # Main weight computation for Jaro distance
        weight = (
            num_com / lens + num_com / lent + (num_com - n_trans) / num_com
        )
        weight /= 3.0

        # Continue to boost the weight if the strings are similar
        # This is the Winkler portion of Jaro-Winkler distance
        if self._mode == 'winkler' and weight > self._boost_threshold:

            # Adjust for having up to the first 4 characters in common
            j = 4 if (minv >= 4) else minv
            i = 0
            while (i < j) and (src_list[i] == tar_list[i]):
                i += 1
            weight += i * self._scaling_factor * (1.0 - weight)

            # Optionally adjust for long strings.

            # After agreeing beginning chars, at least two more must agree and
            # the agreeing characters must be > .5 of remaining characters.
            if (
                self._long_strings
                and (minv > 4)
                and (num_com > i + 1)
                and (2 * num_com >= minv + i)
            ):
                weight += (1.0 - weight) * (
                    (num_com - i - 1) / (lens + lent - i * 2 + 2)
                )

        return weight


if __name__ == '__main__':
    import doctest

    doctest.testmod()


1		# Copyright 2014-2020 by Christopher C. Little.
2		# This file is part of Abydos.
3		#
4		# Abydos is free software: you can redistribute it and/or modify
5		# it under the terms of the GNU General Public License as published by
6		# the Free Software Foundation, either version 3 of the License, or
7		# (at your option) any later version.
8		#
9		# Abydos is distributed in the hope that it will be useful,
10		# but WITHOUT ANY WARRANTY; without even the implied warranty of
11		# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12		# GNU General Public License for more details.
13		#
14		# You should have received a copy of the GNU General Public License
15		# along with Abydos. If not, see <http://www.gnu.org/licenses/>.
16
17		"""abydos.distance._jaro_winkler.
18
19	1	The distance._JaroWinkler module implements distance metrics based on
20		:cite:`Jaro:1989` and subsequent works:
21
22		- Jaro distance
23		- Jaro-Winkler distance
24		"""
25
26		from typing import Any
27
28	1	from ._distance import _Distance
29		from ..tokenizer import QGrams
30
31		__all__ = ['JaroWinkler']
32
33
34		class JaroWinkler(_Distance):
35	1	"""Jaro-Winkler distance.
36
37	1	Jaro(-Winkler) distance is a string edit distance initially proposed by
38		Jaro and extended by Winkler :cite:`Jaro:1989,Winkler:1990`.
39	1
40	1	This is Python based on the C code for strcmp95:
41	1	http://web.archive.org/web/20110629121242/http://www.census.gov/geo/msb/stand/strcmp.c
42		:cite:`Winkler:1994`. The above file is a US Government publication and,
43	1	accordingly, in the public domain.
44
45		.. versionadded:: 0.3.6
46	1	"""
47
48		def __init__(
49		self,
50		qval: int = 1,
51		mode: str = 'winkler',
52		long_strings: bool = False,
53		boost_threshold: float = 0.7,
54		scaling_factor: float = 0.1,
55		**kwargs: Any
56		) -> None:
57		"""Initialize JaroWinkler instance.
58
59		Parameters
60	1	----------
61		qval : int
62		The length of each q-gram (defaults to 1: character-wise matching)
63		mode : str
64		Indicates which variant of this distance metric to compute:
65
66		- ``winkler`` -- computes the Jaro-Winkler distance (default)
67		which increases the score for matches near the start of the
68		word
69		- ``jaro`` -- computes the Jaro distance
70
71		long_strings : bool
72		Set to True to "Increase the probability of a match when the number
73		of matched characters is large. This option allows for a little
74		more tolerance when the strings are large. It is not an appropriate
75		test when comparing fixed length fields such as phone and social
76		security numbers." (Used in 'winkler' mode only.)
77		boost_threshold : float
78		A value between 0 and 1, below which the Winkler boost is not
79		applied (defaults to 0.7). (Used in 'winkler' mode only.)
80		scaling_factor : float
81		A value between 0 and 0.25, indicating by how much to boost scores
82		for matching prefixes (defaults to 0.1). (Used in 'winkler' mode
83		only.)
84
85
86		.. versionadded:: 0.4.0
87
88		"""
89		super(JaroWinkler, self).__init__(**kwargs)
90		self._qval = qval
91		self._mode = mode
92		self._long_strings = long_strings
93		self._boost_threshold = boost_threshold
94		self._scaling_factor = scaling_factor
95
96		def sim(self, src: str, tar: str) -> float:
97		"""Return the Jaro or Jaro-Winkler similarity of two strings.
98
99		Parameters
100		----------
101	1	src : str
102	1	Source string for comparison
103	1	tar : str
104	1	Target string for comparison
105	1
106	1	Returns
107		-------
108	1	float
109		Jaro or Jaro-Winkler similarity
110
111		Raises
112		------
113		ValueError
114		Unsupported boost_threshold assignment; boost_threshold must be
115		between 0 and 1.
116		ValueError
117		Unsupported scaling_factor assignment; scaling_factor must be
118		between 0 and 0.25.'
119
120		Examples
121		--------
122		>>> cmp = JaroWinkler()
123		>>> round(cmp.sim('cat', 'hat'), 12)
124		0.777777777778
125		>>> round(cmp.sim('Niall', 'Neil'), 12)
126		0.805
127		>>> round(cmp.sim('aluminum', 'Catalan'), 12)
128		0.60119047619
129		>>> round(cmp.sim('ATCG', 'TAGC'), 12)
130		0.833333333333
131
132		>>> cmp = JaroWinkler(mode='jaro')
133		>>> round(cmp.sim('cat', 'hat'), 12)
134		0.777777777778
135		>>> round(cmp.sim('Niall', 'Neil'), 12)
136		0.783333333333
137		>>> round(cmp.sim('aluminum', 'Catalan'), 12)
138		0.60119047619
139		>>> round(cmp.sim('ATCG', 'TAGC'), 12)
140		0.833333333333
141
142
143		.. versionadded:: 0.1.0
144		.. versionchanged:: 0.3.6
145		Encapsulated in class
146
147		"""
148		if self._mode == 'winkler':
149		if self._boost_threshold > 1 or self._boost_threshold < 0:
150		raise ValueError(
151		'Unsupported boost_threshold assignment; '
152		+ 'boost_threshold must be between 0 and 1.'
153		)
154		if self._scaling_factor > 0.25 or self._scaling_factor < 0:
155		raise ValueError(
156		'Unsupported scaling_factor assignment; '
157		+ 'scaling_factor must be between 0 and 0.25.'
158	1	)
159	1
160	1	if src == tar:
161		return 1.0
162
163		tokenizer = QGrams(self._qval)
164	1	tokenizer.tokenize(src.strip())
165	1	src_list = tokenizer.get_list()
166		tokenizer.tokenize(tar.strip())
167		tar_list = tokenizer.get_list()
168
169		lens = len(src_list)
170	1	lent = len(tar_list)
171	1
172		# If either string is blank - return - added in Version 2
173	1	if lens == 0 or lent == 0:
174	1	return 0.0
175
176	1	if lens > lent:
177	1	search_range = lens
178		minv = lent
179		else:
180	1	search_range = lent
181	1	minv = lens
182
183	1	# Zero out the flags
184	1	src_flag = [0] * search_range
185	1	tar_flag = [0] * search_range
186		search_range = max(0, search_range // 2 - 1)
187	1
188	1	# Looking only within the search range,
189		# count and flag the matched pairs.
190		num_com = 0
191	1	yl1 = lent - 1
192	1	for i in range(lens):
193	1	low_lim = (i - search_range) if (i >= search_range) else 0
194		hi_lim = (i + search_range) if ((i + search_range) <= yl1) else yl1
195		for j in range(low_lim, hi_lim + 1):
196		if (tar_flag[j] == 0) and (tar_list[j] == src_list[i]):
197	1	tar_flag[j] = 1
198	1	src_flag[i] = 1
199	1	num_com += 1
200	1	break
201	1
202	1	# If no characters in common - return
203	1	if num_com == 0:
204	1	return 0.0
205	1
206	1	# Count the number of transpositions
207	1	k = n_trans = 0
208		for i in range(lens):
209		if src_flag[i] != 0:
210	1	j = 0
211	1	for j in range(k, lent): # pragma: no branch
212		if tar_flag[j] != 0:
213		k = j + 1
214	1	break
215	1	if src_list[i] != tar_list[j]:
216	1	n_trans += 1
217	1	n_trans //= 2
218	1
219	1	# Main weight computation for Jaro distance
220	1	weight = (
221	1	num_com / lens + num_com / lent + (num_com - n_trans) / num_com
222	1	)
223	1	weight /= 3.0
224	1
225		# Continue to boost the weight if the strings are similar
226		# This is the Winkler portion of Jaro-Winkler distance
227	1	if self._mode == 'winkler' and weight > self._boost_threshold:
228
229		# Adjust for having up to the first 4 characters in common
230	1	j = 4 if (minv >= 4) else minv
231		i = 0
232		while (i < j) and (src_list[i] == tar_list[i]):
233		i += 1
234	1	weight += i * self._scaling_factor * (1.0 - weight)
235
236		# Optionally adjust for long strings.
237	1
238	1	# After agreeing beginning chars, at least two more must agree and
239	1	# the agreeing characters must be > .5 of remaining characters.
240	1	if (
241	1	self._long_strings
242		and (minv > 4)
243		and (num_com > i + 1)
244		and (2 * num_com >= minv + i)
245		):
246		weight += (1.0 - weight) * (
247	1	(num_com - i - 1) / (lens + lent - i * 2 + 2)
248		)
249
250		return weight
251
252
253	1	if __name__ == '__main__':
254		import doctest
255
256		doctest.testmod()
257

chrislit / abydos

abydos.distance._jaro_winkler A last analyzed 2020-12-31 20:10 UTC

Complexity

Size/Duplication

Test Coverage

Importance

2 Methods

Duplication Side-by-Side

Filter issues like

abydos.distance._jaro_winkler A
last analyzed 2020-12-31 20:10 UTC