abydos.distance._jaro_winkler.sim_jaro_winkler() - Code Metrics - Inspection of "0.3.6" - chrislit/abydos - Measure and Improve Code Quality continuously with Scrutinizer

Completed

Pull Request — master (#141)

by Chris

created 2018-11-10 03:25 UTC

abydos.distance._jaro_winkler.sim_jaro_winkler() A

↳ Parent: abydos.distance._jaro_winkler

Complexity

Conditions

Size

Total Lines	62
Code Lines	10

Duplication

Lines	0
Ratio	0 %

Code Coverage

Tests	2
CRAP Score	1

Importance

Changes

Metric	Value
cc	1
eloc	10
nop	7
dl	0
loc	62
ccs	2
cts	2
cp	1
crap	1
rs	9.9
c	0
b	0
f	0

How to fix Long Method

# -*- coding: utf-8 -*-

# Copyright 2014-2018 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.

"""abydos.distance._jaro_winkler.

The distance._JaroWinkler module implements distance metrics based on
:cite:`Jaro:1989` and subsequent works:

    - Jaro distance
    - Jaro-Winkler distance
"""

from __future__ import (
    absolute_import,
    division,
    print_function,
    unicode_literals,
)

from six.moves import range

from ._distance import _Distance
from ..tokenizer import QGrams

__all__ = ['JaroWinkler', 'dist_jaro_winkler', 'sim_jaro_winkler']


class JaroWinkler(_Distance):

    """Jaro-Winkler distance.

    Jaro(-Winkler) distance is a string edit distance initially proposed by
    Jaro and extended by Winkler :cite:`Jaro:1989,Winkler:1990`.

    This is Python based on the C code for strcmp95:
    http://web.archive.org/web/20110629121242/http://www.census.gov/geo/msb/stand/strcmp.c
    :cite:`Winkler:1994`. The above file is a US Government publication and,
    accordingly, in the public domain.
    """

    def sim(

        self,

        src,

        tar,

        qval=1,

        mode='winkler',

        long_strings=False,

        boost_threshold=0.7,

        scaling_factor=0.1,

    ):
        """Return the Jaro or Jaro-Winkler similarity of two strings.

        Args:
            src (str): Source string for comparison
            tar (str): Target string for comparison
            qval (int): The length of each q-gram (defaults to 1:
                character-wise matching)
            mode (str): Indicates which variant of this distance metric to
                compute:
                    - ``winkler`` -- computes the Jaro-Winkler distance
                      (default) which increases the score for matches near the
                      start of the word
                    - ``jaro`` -- computes the Jaro distance
            long_strings (bool): Set to True to "Increase the probability of a
                match when the number of matched characters is large. This
                option allows for a little more tolerance when the strings are
                large. It is not an appropriate test when comparing fixed
                length fields such as phone and social security numbers."
                (Used in 'winkler' mode only.)
            boost_threshold (float): A value between 0 and 1, below which the
                Winkler boost is not applied (defaults to 0.7). (Used in
                'winkler' mode only.)
            scaling_factor (float): A value between 0 and 0.25, indicating by
                how much to boost scores for matching prefixes (defaults to
                0.1). (Used in 'winkler' mode only.)

        Returns:
            float: Jaro or Jaro-Winkler similarity

        Raises:
            ValueError: Unsupported boost_threshold assignment; boost_threshold
                must be between 0 and 1.
            ValueError: Unsupported scaling_factor assignment; scaling_factor
                must be between 0 and 0.25.'

        Examples:
            >>> round(sim_jaro_winkler('cat', 'hat'), 12)
            0.777777777778
            >>> round(sim_jaro_winkler('Niall', 'Neil'), 12)
            0.805
            >>> round(sim_jaro_winkler('aluminum', 'Catalan'), 12)
            0.60119047619
            >>> round(sim_jaro_winkler('ATCG', 'TAGC'), 12)
            0.833333333333

            >>> round(sim_jaro_winkler('cat', 'hat', mode='jaro'), 12)
            0.777777777778
            >>> round(sim_jaro_winkler('Niall', 'Neil', mode='jaro'), 12)
            0.783333333333
            >>> round(sim_jaro_winkler('aluminum', 'Catalan', mode='jaro'), 12)
            0.60119047619
            >>> round(sim_jaro_winkler('ATCG', 'TAGC', mode='jaro'), 12)
            0.833333333333

        """
        if mode == 'winkler':
            if boost_threshold > 1 or boost_threshold < 0:
                raise ValueError(
                    'Unsupported boost_threshold assignment; '
                    + 'boost_threshold must be between 0 and 1.'
                )
            if scaling_factor > 0.25 or scaling_factor < 0:
                raise ValueError(
                    'Unsupported scaling_factor assignment; '
                    + 'scaling_factor must be between 0 and 0.25.'
                )

        if src == tar:
            return 1.0

        src = QGrams(src.strip(), qval).ordered_list
        tar = QGrams(tar.strip(), qval).ordered_list

        lens = len(src)
        lent = len(tar)

        # If either string is blank - return - added in Version 2
        if lens == 0 or lent == 0:
            return 0.0

        if lens > lent:
            search_range = lens
            minv = lent
        else:
            search_range = lent
            minv = lens

        # Zero out the flags
        src_flag = [0] * search_range
        tar_flag = [0] * search_range
        search_range = max(0, search_range // 2 - 1)

        # Looking only within the search range,
        # count and flag the matched pairs.
        num_com = 0
        yl1 = lent - 1
        for i in range(lens):
            low_lim = (i - search_range) if (i >= search_range) else 0
            hi_lim = (i + search_range) if ((i + search_range) <= yl1) else yl1
            for j in range(low_lim, hi_lim + 1):
                if (tar_flag[j] == 0) and (tar[j] == src[i]):
                    tar_flag[j] = 1
                    src_flag[i] = 1
                    num_com += 1
                    break

        # If no characters in common - return
        if num_com == 0:
            return 0.0

        # Count the number of transpositions
        k = n_trans = 0
        for i in range(lens):
            if src_flag[i] != 0:
                j = 0
                for j in range(k, lent):  # pragma: no branch
                    if tar_flag[j] != 0:
                        k = j + 1
                        break
                if src[i] != tar[j]:
                    n_trans += 1
        n_trans //= 2

        # Main weight computation for Jaro distance
        weight = (
            num_com / lens + num_com / lent + (num_com - n_trans) / num_com
        )
        weight /= 3.0

        # Continue to boost the weight if the strings are similar
        # This is the Winkler portion of Jaro-Winkler distance
        if mode == 'winkler' and weight > boost_threshold:

            # Adjust for having up to the first 4 characters in common
            j = 4 if (minv >= 4) else minv
            i = 0
            while (i < j) and (src[i] == tar[i]):
                i += 1
            weight += i * scaling_factor * (1.0 - weight)

            # Optionally adjust for long strings.

            # After agreeing beginning chars, at least two more must agree and
            # the agreeing characters must be > .5 of remaining characters.
            if (
                long_strings

                and (minv > 4)

                and (num_com > i + 1)

                and (2 * num_com >= minv + i)

            ):
                weight += (1.0 - weight) * (
                    (num_com - i - 1) / (lens + lent - i * 2 + 2)
                )

        return weight


def sim_jaro_winkler(

    src,

    tar,

    qval=1,

    mode='winkler',

    long_strings=False,

    boost_threshold=0.7,

    scaling_factor=0.1,

):
    """Return the Jaro or Jaro-Winkler similarity of two strings.

    This is a wrapper for :py:meth:`JaroWinkler.sim`.

    Args:
        src (str): Source string for comparison
        tar (str): Target string for comparison
        qval (int): The length of each q-gram (defaults to 1:
            character-wise matching)
        mode (str): Indicates which variant of this distance metric to
            compute:
                - ``winkler`` -- computes the Jaro-Winkler distance (default)
                  which increases the score for matches near the start of the
                  word
                - ``jaro`` -- computes the Jaro distance
        long_strings (bool): Set to True to "Increase the probability of a
            match when the number of matched characters is large. This option
            allows for a little more tolerance when the strings are large. It
            is not an appropriate test when comparing fixedlength fields such
            as phone and social security numbers." (Used in 'winkler' mode
            only.)
        boost_threshold (float): A value between 0 and 1, below which the
            Winkler boost is not applied (defaults to 0.7). (Used in 'winkler'
            mode only.)
        scaling_factor (float): A value between 0 and 0.25, indicating by how
            much to boost scores for matching prefixes (defaults to 0.1). (Used
            in 'winkler' mode only.)

    Returns:
        float: Jaro or Jaro-Winkler similarity

    Examples:
        >>> round(sim_jaro_winkler('cat', 'hat'), 12)
        0.777777777778
        >>> round(sim_jaro_winkler('Niall', 'Neil'), 12)
        0.805
        >>> round(sim_jaro_winkler('aluminum', 'Catalan'), 12)
        0.60119047619
        >>> round(sim_jaro_winkler('ATCG', 'TAGC'), 12)
        0.833333333333

        >>> round(sim_jaro_winkler('cat', 'hat', mode='jaro'), 12)
        0.777777777778
        >>> round(sim_jaro_winkler('Niall', 'Neil', mode='jaro'), 12)
        0.783333333333
        >>> round(sim_jaro_winkler('aluminum', 'Catalan', mode='jaro'), 12)
        0.60119047619
        >>> round(sim_jaro_winkler('ATCG', 'TAGC', mode='jaro'), 12)
        0.833333333333

    """
    return JaroWinkler().sim(
        src, tar, qval, mode, long_strings, boost_threshold, scaling_factor
    )


def dist_jaro_winkler(

    src,

    tar,

    qval=1,

    mode='winkler',

    long_strings=False,

    boost_threshold=0.7,

    scaling_factor=0.1,

):
    """Return the Jaro or Jaro-Winkler distance between two strings.

    This is a wrapper for :py:meth:`JaroWinkler.dist`.

    Args:
        src (str): Source string for comparison
        tar (str): Target string for comparison
        qval (int): The length of each q-gram (defaults to 1:
            character-wise matching)
        mode (str): Indicates which variant of this distance metric to
            compute:
                - ``winkler`` -- computes the Jaro-Winkler distance (default)
                  which increases the score for matches near the start of the
                  word
                - ``jaro`` -- computes the Jaro distance
        long_strings (bool): Set to True to "Increase the probability of a
            match when the number of matched characters is large. This option
            allows for a little more tolerance when the strings are large. It
            is not an appropriate test when comparing fixedlength fields such
            as phone and social security numbers." (Used in 'winkler' mode
            only.)
        boost_threshold (float): A value between 0 and 1, below which the
            Winkler boost is not applied (defaults to 0.7). (Used in 'winkler'
            mode only.)
        scaling_factor (float): A value between 0 and 0.25, indicating by how
            much to boost scores for matching prefixes (defaults to 0.1). (Used
            in 'winkler' mode only.)

    Returns:
        float: Jaro or Jaro-Winkler distance

    Examples:
        >>> round(dist_jaro_winkler('cat', 'hat'), 12)
        0.222222222222
        >>> round(dist_jaro_winkler('Niall', 'Neil'), 12)
        0.195
        >>> round(dist_jaro_winkler('aluminum', 'Catalan'), 12)
        0.39880952381
        >>> round(dist_jaro_winkler('ATCG', 'TAGC'), 12)
        0.166666666667

        >>> round(dist_jaro_winkler('cat', 'hat', mode='jaro'), 12)
        0.222222222222
        >>> round(dist_jaro_winkler('Niall', 'Neil', mode='jaro'), 12)
        0.216666666667
        >>> round(dist_jaro_winkler('aluminum', 'Catalan', mode='jaro'), 12)
        0.39880952381
        >>> round(dist_jaro_winkler('ATCG', 'TAGC', mode='jaro'), 12)
        0.166666666667

    """
    return JaroWinkler().dist(
        src, tar, qval, mode, long_strings, boost_threshold, scaling_factor
    )


if __name__ == '__main__':
    import doctest

    doctest.testmod()


1		# -- coding: utf-8 --
2
3		# Copyright 2014-2018 by Christopher C. Little.
4		# This file is part of Abydos.
5		#
6		# Abydos is free software: you can redistribute it and/or modify
7		# it under the terms of the GNU General Public License as published by
8		# the Free Software Foundation, either version 3 of the License, or
9		# (at your option) any later version.
10		#
11		# Abydos is distributed in the hope that it will be useful,
12		# but WITHOUT ANY WARRANTY; without even the implied warranty of
13		# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14		# GNU General Public License for more details.
15		#
16		# You should have received a copy of the GNU General Public License
17		# along with Abydos. If not, see <http://www.gnu.org/licenses/>.
18
19	1	"""abydos.distance._jaro_winkler.
20
21		The distance._JaroWinkler module implements distance metrics based on
22		:cite:`Jaro:1989` and subsequent works:
23
24		- Jaro distance
25		- Jaro-Winkler distance
26		"""
27
28	1	from __future__ import (
29		absolute_import,
30		division,
31		print_function,
32		unicode_literals,
33		)
34
35	1	from six.moves import range
36
37	1	from ._distance import _Distance
38	1	from ..tokenizer import QGrams
39
40	1	__all__ = ['JaroWinkler', 'dist_jaro_winkler', 'sim_jaro_winkler']
41
42
43	1	class JaroWinkler(_Distance):
		0 ignored issues – show Unused Code introduced 2018-11-10 01:42 UTC by Report Bug Copy Issue Report The variable `__class__` seems to be unused. Loading history...
44		"""Jaro-Winkler distance.
45
46		Jaro(-Winkler) distance is a string edit distance initially proposed by
47		Jaro and extended by Winkler :cite:`Jaro:1989,Winkler:1990`.
48
49		This is Python based on the C code for strcmp95:
50		http://web.archive.org/web/20110629121242/http://www.census.gov/geo/msb/stand/strcmp.c
51		:cite:`Winkler:1994`. The above file is a US Government publication and,
52		accordingly, in the public domain.
53		"""
54
55	1	def sim(
		0 ignored issues – show best-practice introduced 2018-11-04 08:02 UTC by Report Bug Copy Issue Report Too many arguments (8/5) Loading history... Comprehensibility introduced 2018-11-04 08:02 UTC by Report Bug Copy Issue Report This function exceeds the maximum number of variables (24/15). Loading history... Bug introduced 2018-11-04 08:02 UTC by Report Bug Copy Issue Report Parameters differ from overridden 'sim' method Loading history...
56		self,
		0 ignored issues – show Coding Style introduced 2018-11-04 08:02 UTC by Report Bug Copy Issue Report Wrong hanging indentation before block (add 4 spaces). Loading history...
57		src,
		0 ignored issues – show Coding Style introduced 2018-11-04 08:02 UTC by Report Bug Copy Issue Report Wrong hanging indentation before block (add 4 spaces). Loading history...
58		tar,
		0 ignored issues – show Coding Style introduced 2018-11-04 08:02 UTC by Report Bug Copy Issue Report Wrong hanging indentation before block (add 4 spaces). Loading history...
59		qval=1,
		0 ignored issues – show Coding Style introduced 2018-10-24 06:00 UTC by Report Bug Copy Issue Report Wrong hanging indentation before block (add 4 spaces). Loading history...
60		mode='winkler',
		0 ignored issues – show Coding Style introduced 2018-10-24 06:00 UTC by Report Bug Copy Issue Report Wrong hanging indentation before block (add 4 spaces). Loading history...
61		long_strings=False,
		0 ignored issues – show Coding Style introduced 2018-10-24 06:00 UTC by Report Bug Copy Issue Report Wrong hanging indentation before block (add 4 spaces). Loading history...
62		boost_threshold=0.7,
		0 ignored issues – show Coding Style introduced 2018-10-24 06:00 UTC by Report Bug Copy Issue Report Wrong hanging indentation before block (add 4 spaces). Loading history...
63		scaling_factor=0.1,
		0 ignored issues – show Coding Style introduced 2018-10-24 06:00 UTC by Report Bug Copy Issue Report Wrong hanging indentation before block (add 4 spaces). Loading history...
64		):
65		"""Return the Jaro or Jaro-Winkler similarity of two strings.
66
67		Args:
68		src (str): Source string for comparison
69		tar (str): Target string for comparison
70		qval (int): The length of each q-gram (defaults to 1:
71		character-wise matching)
72		mode (str): Indicates which variant of this distance metric to
73		compute:
74		- ``winkler`` -- computes the Jaro-Winkler distance
75		(default) which increases the score for matches near the
76		start of the word
77		- ``jaro`` -- computes the Jaro distance
78		long_strings (bool): Set to True to "Increase the probability of a
79		match when the number of matched characters is large. This
80		option allows for a little more tolerance when the strings are
81		large. It is not an appropriate test when comparing fixed
82		length fields such as phone and social security numbers."
83		(Used in 'winkler' mode only.)
84		boost_threshold (float): A value between 0 and 1, below which the
85		Winkler boost is not applied (defaults to 0.7). (Used in
86		'winkler' mode only.)
87		scaling_factor (float): A value between 0 and 0.25, indicating by
88		how much to boost scores for matching prefixes (defaults to
89		0.1). (Used in 'winkler' mode only.)
90
91		Returns:
92		float: Jaro or Jaro-Winkler similarity
93
94		Raises:
95		ValueError: Unsupported boost_threshold assignment; boost_threshold
96		must be between 0 and 1.
97		ValueError: Unsupported scaling_factor assignment; scaling_factor
98		must be between 0 and 0.25.'
99
100		Examples:
101		>>> round(sim_jaro_winkler('cat', 'hat'), 12)
102		0.777777777778
103		>>> round(sim_jaro_winkler('Niall', 'Neil'), 12)
104		0.805
105		>>> round(sim_jaro_winkler('aluminum', 'Catalan'), 12)
106		0.60119047619
107		>>> round(sim_jaro_winkler('ATCG', 'TAGC'), 12)
108		0.833333333333
109
110		>>> round(sim_jaro_winkler('cat', 'hat', mode='jaro'), 12)
111		0.777777777778
112		>>> round(sim_jaro_winkler('Niall', 'Neil', mode='jaro'), 12)
113		0.783333333333
114		>>> round(sim_jaro_winkler('aluminum', 'Catalan', mode='jaro'), 12)
115		0.60119047619
116		>>> round(sim_jaro_winkler('ATCG', 'TAGC', mode='jaro'), 12)
117		0.833333333333
118
119		"""
120	1	if mode == 'winkler':
121	1	if boost_threshold > 1 or boost_threshold < 0:
122	1	raise ValueError(
123		'Unsupported boost_threshold assignment; '
124		+ 'boost_threshold must be between 0 and 1.'
125		)
126	1	if scaling_factor > 0.25 or scaling_factor < 0:
127	1	raise ValueError(
128		'Unsupported scaling_factor assignment; '
129		+ 'scaling_factor must be between 0 and 0.25.'
130		)
131
132	1	if src == tar:
133	1	return 1.0
134
135	1	src = QGrams(src.strip(), qval).ordered_list
136	1	tar = QGrams(tar.strip(), qval).ordered_list
137
138	1	lens = len(src)
139	1	lent = len(tar)
140
141		# If either string is blank - return - added in Version 2
142	1	if lens == 0 or lent == 0:
143	1	return 0.0
144
145	1	if lens > lent:
146	1	search_range = lens
147	1	minv = lent
148		else:
149	1	search_range = lent
150	1	minv = lens
151
152		# Zero out the flags
153	1	src_flag = [0] * search_range
154	1	tar_flag = [0] * search_range
155	1	search_range = max(0, search_range // 2 - 1)
156
157		# Looking only within the search range,
158		# count and flag the matched pairs.
159	1	num_com = 0
160	1	yl1 = lent - 1
161	1	for i in range(lens):
162	1	low_lim = (i - search_range) if (i >= search_range) else 0
163	1	hi_lim = (i + search_range) if ((i + search_range) <= yl1) else yl1
164	1	for j in range(low_lim, hi_lim + 1):
165	1	if (tar_flag[j] == 0) and (tar[j] == src[i]):
166	1	tar_flag[j] = 1
167	1	src_flag[i] = 1
168	1	num_com += 1
169	1	break
170
171		# If no characters in common - return
172	1	if num_com == 0:
173	1	return 0.0
174
175		# Count the number of transpositions
176	1	k = n_trans = 0
177	1	for i in range(lens):
178	1	if src_flag[i] != 0:
179	1	j = 0
180	1	for j in range(k, lent): # pragma: no branch
181	1	if tar_flag[j] != 0:
182	1	k = j + 1
183	1	break
184	1	if src[i] != tar[j]:
185	1	n_trans += 1
186	1	n_trans //= 2
187
188		# Main weight computation for Jaro distance
189	1	weight = (
190		num_com / lens + num_com / lent + (num_com - n_trans) / num_com
191		)
192	1	weight /= 3.0
193
194		# Continue to boost the weight if the strings are similar
195		# This is the Winkler portion of Jaro-Winkler distance
196	1	if mode == 'winkler' and weight > boost_threshold:
197
198		# Adjust for having up to the first 4 characters in common
199	1	j = 4 if (minv >= 4) else minv
200	1	i = 0
201	1	while (i < j) and (src[i] == tar[i]):
202	1	i += 1
203	1	weight += i * scaling_factor * (1.0 - weight)
204
205		# Optionally adjust for long strings.
206
207		# After agreeing beginning chars, at least two more must agree and
208		# the agreeing characters must be > .5 of remaining characters.
209	1	if (
210		long_strings
		0 ignored issues – show Coding Style introduced 2018-10-24 06:00 UTC by Report Bug Copy Issue Report Wrong hanging indentation before block (add 4 spaces). Loading history...
211		and (minv > 4)
		0 ignored issues – show Coding Style introduced 2018-10-24 06:00 UTC by Report Bug Copy Issue Report Wrong hanging indentation before block (add 4 spaces). Loading history...
212		and (num_com > i + 1)
		0 ignored issues – show Coding Style introduced 2018-10-24 06:00 UTC by Report Bug Copy Issue Report Wrong hanging indentation before block (add 4 spaces). Loading history...
213		and (2 * num_com >= minv + i)
		0 ignored issues – show Coding Style introduced 2018-10-24 06:00 UTC by Report Bug Copy Issue Report Wrong hanging indentation before block (add 4 spaces). Loading history...
214		):
215	1	weight += (1.0 - weight) * (
216		(num_com - i - 1) / (lens + lent - i * 2 + 2)
217		)
218
219	1	return weight
220
221
222	1	def sim_jaro_winkler(
		0 ignored issues – show best-practice introduced 2018-11-04 08:02 UTC by Report Bug Copy Issue Report Too many arguments (7/5) Loading history...
223		src,
		0 ignored issues – show Coding Style introduced 2018-10-24 06:00 UTC by Report Bug Copy Issue Report Wrong hanging indentation before block (add 4 spaces). Loading history...
224		tar,
		0 ignored issues – show Coding Style introduced 2018-10-24 06:00 UTC by Report Bug Copy Issue Report Wrong hanging indentation before block (add 4 spaces). Loading history...
225		qval=1,
		0 ignored issues – show Coding Style introduced 2018-10-24 06:00 UTC by Report Bug Copy Issue Report Wrong hanging indentation before block (add 4 spaces). Loading history...
226		mode='winkler',
		0 ignored issues – show Coding Style introduced 2018-10-24 06:00 UTC by Report Bug Copy Issue Report Wrong hanging indentation before block (add 4 spaces). Loading history...
227		long_strings=False,
		0 ignored issues – show Coding Style introduced 2018-10-24 06:00 UTC by Report Bug Copy Issue Report Wrong hanging indentation before block (add 4 spaces). Loading history...
228		boost_threshold=0.7,
		0 ignored issues – show Coding Style introduced 2018-10-24 06:00 UTC by Report Bug Copy Issue Report Wrong hanging indentation before block (add 4 spaces). Loading history...
229		scaling_factor=0.1,
		0 ignored issues – show Coding Style introduced 2018-10-24 06:00 UTC by Report Bug Copy Issue Report Wrong hanging indentation before block (add 4 spaces). Loading history...
230		):
231		"""Return the Jaro or Jaro-Winkler similarity of two strings.
232
233		This is a wrapper for :py:meth:`JaroWinkler.sim`.
234
235		Args:
236		src (str): Source string for comparison
237		tar (str): Target string for comparison
238		qval (int): The length of each q-gram (defaults to 1:
239		character-wise matching)
240		mode (str): Indicates which variant of this distance metric to
241		compute:
242		- ``winkler`` -- computes the Jaro-Winkler distance (default)
243		which increases the score for matches near the start of the
244		word
245		- ``jaro`` -- computes the Jaro distance
246		long_strings (bool): Set to True to "Increase the probability of a
247		match when the number of matched characters is large. This option
248		allows for a little more tolerance when the strings are large. It
249		is not an appropriate test when comparing fixedlength fields such
250		as phone and social security numbers." (Used in 'winkler' mode
251		only.)
252		boost_threshold (float): A value between 0 and 1, below which the
253		Winkler boost is not applied (defaults to 0.7). (Used in 'winkler'
254		mode only.)
255		scaling_factor (float): A value between 0 and 0.25, indicating by how
256		much to boost scores for matching prefixes (defaults to 0.1). (Used
257		in 'winkler' mode only.)
258
259		Returns:
260		float: Jaro or Jaro-Winkler similarity
261
262		Examples:
263		>>> round(sim_jaro_winkler('cat', 'hat'), 12)
264		0.777777777778
265		>>> round(sim_jaro_winkler('Niall', 'Neil'), 12)
266		0.805
267		>>> round(sim_jaro_winkler('aluminum', 'Catalan'), 12)
268		0.60119047619
269		>>> round(sim_jaro_winkler('ATCG', 'TAGC'), 12)
270		0.833333333333
271
272		>>> round(sim_jaro_winkler('cat', 'hat', mode='jaro'), 12)
273		0.777777777778
274		>>> round(sim_jaro_winkler('Niall', 'Neil', mode='jaro'), 12)
275		0.783333333333
276		>>> round(sim_jaro_winkler('aluminum', 'Catalan', mode='jaro'), 12)
277		0.60119047619
278		>>> round(sim_jaro_winkler('ATCG', 'TAGC', mode='jaro'), 12)
279		0.833333333333
280
281		"""
282	1	return JaroWinkler().sim(
283		src, tar, qval, mode, long_strings, boost_threshold, scaling_factor
284		)
285
286
287	1	def dist_jaro_winkler(
		0 ignored issues – show best-practice introduced 2018-11-04 08:02 UTC by Report Bug Copy Issue Report Too many arguments (7/5) Loading history...
288		src,
		0 ignored issues – show Coding Style introduced 2018-11-04 08:02 UTC by Report Bug Copy Issue Report Wrong hanging indentation before block (add 4 spaces). Loading history...
289		tar,
		0 ignored issues – show Coding Style introduced 2018-10-24 06:00 UTC by Report Bug Copy Issue Report Wrong hanging indentation before block (add 4 spaces). Loading history...
290		qval=1,
		0 ignored issues – show Coding Style introduced 2018-11-04 08:02 UTC by Report Bug Copy Issue Report Wrong hanging indentation before block (add 4 spaces). Loading history...
291		mode='winkler',
		0 ignored issues – show Coding Style introduced 2018-11-04 08:02 UTC by Report Bug Copy Issue Report Wrong hanging indentation before block (add 4 spaces). Loading history...
292		long_strings=False,
		0 ignored issues – show Coding Style introduced 2018-11-04 08:02 UTC by Report Bug Copy Issue Report Wrong hanging indentation before block (add 4 spaces). Loading history...
293		boost_threshold=0.7,
		0 ignored issues – show Coding Style introduced 2018-11-04 08:02 UTC by Report Bug Copy Issue Report Wrong hanging indentation before block (add 4 spaces). Loading history...
294		scaling_factor=0.1,
		0 ignored issues – show Coding Style introduced 2018-11-04 08:02 UTC by Report Bug Copy Issue Report Wrong hanging indentation before block (add 4 spaces). Loading history...
295		):
296		"""Return the Jaro or Jaro-Winkler distance between two strings.
297
298		This is a wrapper for :py:meth:`JaroWinkler.dist`.
299
300		Args:
301		src (str): Source string for comparison
302		tar (str): Target string for comparison
303		qval (int): The length of each q-gram (defaults to 1:
304		character-wise matching)
305		mode (str): Indicates which variant of this distance metric to
306		compute:
307		- ``winkler`` -- computes the Jaro-Winkler distance (default)
308		which increases the score for matches near the start of the
309		word
310		- ``jaro`` -- computes the Jaro distance
311		long_strings (bool): Set to True to "Increase the probability of a
312		match when the number of matched characters is large. This option
313		allows for a little more tolerance when the strings are large. It
314		is not an appropriate test when comparing fixedlength fields such
315		as phone and social security numbers." (Used in 'winkler' mode
316		only.)
317		boost_threshold (float): A value between 0 and 1, below which the
318		Winkler boost is not applied (defaults to 0.7). (Used in 'winkler'
319		mode only.)
320		scaling_factor (float): A value between 0 and 0.25, indicating by how
321		much to boost scores for matching prefixes (defaults to 0.1). (Used
322		in 'winkler' mode only.)
323
324		Returns:
325		float: Jaro or Jaro-Winkler distance
326
327		Examples:
328		>>> round(dist_jaro_winkler('cat', 'hat'), 12)
329		0.222222222222
330		>>> round(dist_jaro_winkler('Niall', 'Neil'), 12)
331		0.195
332		>>> round(dist_jaro_winkler('aluminum', 'Catalan'), 12)
333		0.39880952381
334		>>> round(dist_jaro_winkler('ATCG', 'TAGC'), 12)
335		0.166666666667
336
337		>>> round(dist_jaro_winkler('cat', 'hat', mode='jaro'), 12)
338		0.222222222222
339		>>> round(dist_jaro_winkler('Niall', 'Neil', mode='jaro'), 12)
340		0.216666666667
341		>>> round(dist_jaro_winkler('aluminum', 'Catalan', mode='jaro'), 12)
342		0.39880952381
343		>>> round(dist_jaro_winkler('ATCG', 'TAGC', mode='jaro'), 12)
344		0.166666666667
345
346		"""
347	1	return JaroWinkler().dist(
348		src, tar, qval, mode, long_strings, boost_threshold, scaling_factor
349		)
350
351
352		if __name__ == '__main__':
353		import doctest
354
355		doctest.testmod()
356

chrislit / abydos

Pull Request — master (#141)

abydos.distance._jaro_winkler.sim_jaro_winkler() A

Complexity

Size

Duplication

Code Coverage

Importance

How to fix Long Method

Long Method

Duplication Side-by-Side

Filter issues like