abydos.distance.jaro.sim_strcmp95() - Code Metrics - Inspection of "Modularize" - chrislit/abydos - Measure and Improve Code Quality continuously with Scrutinizer

Completed

Pull Request — master (#120)

by Chris

created 2018-10-19 22:19 UTC

abydos.distance.jaro.sim_strcmp95() F

↳ Parent: abydos.distance.jaro

Complexity

Conditions

Size

Total Lines	146
Code Lines	79

Duplication

Lines	0
Ratio	0 %

Importance

Changes

Metric	Value
cc	37
eloc	79
nop	3
dl	0
loc	146
rs	0
c	0
b	0
f	0

How to fix Long Method Complexity

# -*- coding: utf-8 -*-

# Copyright 2014-2018 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.

"""abydos.distance.jaro.

The distance.jaro module implements distance metrics based on
:cite:`Jaro:1989` and subsequent works:

    - Jaro distance
    - Jaro-Winkler distance
    - the strcmp95 algorithm variant of Jaro-Winkler distance
"""

from __future__ import division, unicode_literals

from collections import defaultdict

from six.moves import range

from ..tokenizer.qgram import QGrams


__all__ = ['dist_jaro_winkler', 'dist_strcmp95', 'sim_jaro_winkler',
           'sim_strcmp95']


def sim_strcmp95(src, tar, long_strings=False):
    """Return the strcmp95 similarity of two strings.

    This is a Python translation of the C code for strcmp95:
    http://web.archive.org/web/20110629121242/http://www.census.gov/geo/msb/stand/strcmp.c
    :cite:`Winkler:1994`.
    The above file is a US Government publication and, accordingly,
    in the public domain.

    This is based on the Jaro-Winkler distance, but also attempts to correct
    for some common typos and frequently confused characters. It is also
    limited to uppercase ASCII characters, so it is appropriate to American
    names, but not much else.

    :param str src: source string for comparison
    :param str tar: target string for comparison
    :param bool long_strings: set to True to "Increase the probability of a
        match when the number of matched characters is large.  This option
        allows for a little more tolerance when the strings are large. It is
        not an appropriate test when comparing fixed length fields such as
        phone and social security numbers."
    :returns: strcmp95 similarity
    :rtype: float

    >>> sim_strcmp95('cat', 'hat')
    0.7777777777777777
    >>> sim_strcmp95('Niall', 'Neil')
    0.8454999999999999
    >>> sim_strcmp95('aluminum', 'Catalan')
    0.6547619047619048
    >>> sim_strcmp95('ATCG', 'TAGC')
    0.8333333333333334
    """
    def _in_range(char):
        """Return True if char is in the range (0, 91)."""
        return 91 > ord(char) > 0

    ying = src.strip().upper()
    yang = tar.strip().upper()

    if ying == yang:
        return 1.0
    # If either string is blank - return - added in Version 2
    if not ying or not yang:
        return 0.0

    adjwt = defaultdict(int)
    sp_mx = (
        ('A', 'E'), ('A', 'I'), ('A', 'O'), ('A', 'U'), ('B', 'V'), ('E', 'I'),
        ('E', 'O'), ('E', 'U'), ('I', 'O'), ('I', 'U'), ('O', 'U'), ('I', 'Y'),
        ('E', 'Y'), ('C', 'G'), ('E', 'F'), ('W', 'U'), ('W', 'V'), ('X', 'K'),
        ('S', 'Z'), ('X', 'S'), ('Q', 'C'), ('U', 'V'), ('M', 'N'), ('L', 'I'),
        ('Q', 'O'), ('P', 'R'), ('I', 'J'), ('2', 'Z'), ('5', 'S'), ('8', 'B'),
        ('1', 'I'), ('1', 'L'), ('0', 'O'), ('0', 'Q'), ('C', 'K'), ('G', 'J')
    )

    # Initialize the adjwt array on the first call to the function only.
    # The adjwt array is used to give partial credit for characters that
    # may be errors due to known phonetic or character recognition errors.
    # A typical example is to match the letter "O" with the number "0"
    for i in sp_mx:
        adjwt[(i[0], i[1])] = 3
        adjwt[(i[1], i[0])] = 3

    if len(ying) > len(yang):
        search_range = len(ying)
        minv = len(yang)
    else:
        search_range = len(yang)
        minv = len(ying)

    # Blank out the flags
    ying_flag = [0] * search_range
    yang_flag = [0] * search_range
    search_range = max(0, search_range // 2 - 1)

    # Looking only within the search range, count and flag the matched pairs.
    num_com = 0
    yl1 = len(yang) - 1
    for i in range(len(ying)):
        low_lim = (i - search_range) if (i >= search_range) else 0
        hi_lim = (i + search_range) if ((i + search_range) <= yl1) else yl1
        for j in range(low_lim, hi_lim+1):
            if (yang_flag[j] == 0) and (yang[j] == ying[i]):
                yang_flag[j] = 1
                ying_flag[i] = 1
                num_com += 1
                break

    # If no characters in common - return
    if num_com == 0:
        return 0.0

    # Count the number of transpositions
    k = n_trans = 0
    for i in range(len(ying)):
        if ying_flag[i] != 0:
            j = 0
            for j in range(k, len(yang)):  # pragma: no branch
                if yang_flag[j] != 0:
                    k = j + 1
                    break
            if ying[i] != yang[j]:
                n_trans += 1
    n_trans //= 2

    # Adjust for similarities in unmatched characters
    n_simi = 0
    if minv > num_com:
        for i in range(len(ying)):
            if ying_flag[i] == 0 and _in_range(ying[i]):
                for j in range(len(yang)):
                    if yang_flag[j] == 0 and _in_range(yang[j]):
                        if (ying[i], yang[j]) in adjwt:
                            n_simi += adjwt[(ying[i], yang[j])]
                            yang_flag[j] = 2
                            break
    num_sim = n_simi/10.0 + num_com

    # Main weight computation
    weight = num_sim / len(ying) + num_sim / len(yang) + \
        (num_com - n_trans) / num_com
    weight /= 3.0

    # Continue to boost the weight if the strings are similar
    if weight > 0.7:

        # Adjust for having up to the first 4 characters in common
        j = 4 if (minv >= 4) else minv
        i = 0
        while (i < j) and (ying[i] == yang[i]) and (not ying[i].isdigit()):
            i += 1
        if i:
            weight += i * 0.1 * (1.0 - weight)

        # Optionally adjust for long strings.

        # After agreeing beginning chars, at least two more must agree and
        # the agreeing characters must be > .5 of remaining characters.
        if (long_strings and (minv > 4) and (num_com > i+1) and
                (2*num_com >= minv+i)):
            if not ying[0].isdigit():
                weight += (1.0-weight) * ((num_com-i-1) /
                                          (len(ying)+len(yang)-i*2+2))

    return weight


def dist_strcmp95(src, tar, long_strings=False):
    """Return the strcmp95 distance between two strings.

    strcmp95 distance is the complement of strcmp95 similarity:
    :math:`dist_{strcmp95} = 1 - sim_{strcmp95}`.

    :param str src: source string for comparison
    :param str tar: target string for comparison
    :param bool long_strings: set to True to "Increase the probability of a
        match when the number of matched characters is large.  This option
        allows for a little more tolerance when the strings are large. It is
        not an appropriate test when comparing fixed length fields such as
        phone and social security numbers."
    :returns: strcmp95 distance
    :rtype: float

    >>> round(dist_strcmp95('cat', 'hat'), 12)
    0.222222222222
    >>> round(dist_strcmp95('Niall', 'Neil'), 12)
    0.1545
    >>> round(dist_strcmp95('aluminum', 'Catalan'), 12)
    0.345238095238
    >>> round(dist_strcmp95('ATCG', 'TAGC'), 12)
    0.166666666667
    """
    return 1 - sim_strcmp95(src, tar, long_strings)


def sim_jaro_winkler(src, tar, qval=1, mode='winkler', long_strings=False,
                     boost_threshold=0.7, scaling_factor=0.1):
    """Return the Jaro or Jaro-Winkler similarity of two strings.

    Jaro(-Winkler) distance is a string edit distance initially proposed by
    Jaro and extended by Winkler :cite:`Jaro:1989,Winkler:1990`.

    This is Python based on the C code for strcmp95:
    http://web.archive.org/web/20110629121242/http://www.census.gov/geo/msb/stand/strcmp.c
    :cite:`Winkler:1994`. The above file is a US Government publication and,
    accordingly, in the public domain.

    :param str src: source string for comparison
    :param str tar: target string for comparison
    :param int qval: the length of each q-gram (defaults to 1: character-wise
        matching)
    :param str mode: indicates which variant of this distance metric to
        compute:

            - 'winkler' -- computes the Jaro-Winkler distance (default) which
              increases the score for matches near the start of the word
            - 'jaro' -- computes the Jaro distance

    The following arguments apply only when mode is 'winkler':

    :param bool long_strings: set to True to "Increase the probability of a
        match when the number of matched characters is large.  This option
        allows for a little more tolerance when the strings are large.  It is
        not an appropriate test when comparing fixed length fields such as
        phone and social security numbers."
    :param float boost_threshold: a value between 0 and 1, below which the
        Winkler boost is not applied (defaults to 0.7)
    :param float scaling_factor: a value between 0 and 0.25, indicating by how
        much to boost scores for matching prefixes (defaults to 0.1)

    :returns: Jaro or Jaro-Winkler similarity
    :rtype: float

    >>> round(sim_jaro_winkler('cat', 'hat'), 12)
    0.777777777778
    >>> round(sim_jaro_winkler('Niall', 'Neil'), 12)
    0.805
    >>> round(sim_jaro_winkler('aluminum', 'Catalan'), 12)
    0.60119047619
    >>> round(sim_jaro_winkler('ATCG', 'TAGC'), 12)
    0.833333333333

    >>> round(sim_jaro_winkler('cat', 'hat', mode='jaro'), 12)
    0.777777777778
    >>> round(sim_jaro_winkler('Niall', 'Neil', mode='jaro'), 12)
    0.783333333333
    >>> round(sim_jaro_winkler('aluminum', 'Catalan', mode='jaro'), 12)
    0.60119047619
    >>> round(sim_jaro_winkler('ATCG', 'TAGC', mode='jaro'), 12)
    0.833333333333
    """
    if mode == 'winkler':
        if boost_threshold > 1 or boost_threshold < 0:
            raise ValueError('Unsupported boost_threshold assignment; ' +
                             'boost_threshold must be between 0 and 1.')
        if scaling_factor > 0.25 or scaling_factor < 0:
            raise ValueError('Unsupported scaling_factor assignment; ' +
                             'scaling_factor must be between 0 and 0.25.')

    if src == tar:
        return 1.0

    src = QGrams(src.strip(), qval).ordered_list
    tar = QGrams(tar.strip(), qval).ordered_list

    lens = len(src)
    lent = len(tar)

    # If either string is blank - return - added in Version 2
    if lens == 0 or lent == 0:
        return 0.0

    if lens > lent:
        search_range = lens
        minv = lent
    else:
        search_range = lent
        minv = lens

    # Zero out the flags
    src_flag = [0] * search_range
    tar_flag = [0] * search_range
    search_range = max(0, search_range//2 - 1)

    # Looking only within the search range, count and flag the matched pairs.
    num_com = 0
    yl1 = lent - 1
    for i in range(lens):
        low_lim = (i - search_range) if (i >= search_range) else 0
        hi_lim = (i + search_range) if ((i + search_range) <= yl1) else yl1
        for j in range(low_lim, hi_lim+1):
            if (tar_flag[j] == 0) and (tar[j] == src[i]):
                tar_flag[j] = 1
                src_flag[i] = 1
                num_com += 1
                break

    # If no characters in common - return
    if num_com == 0:
        return 0.0

    # Count the number of transpositions
    k = n_trans = 0
    for i in range(lens):
        if src_flag[i] != 0:
            j = 0
            for j in range(k, lent):  # pragma: no branch
                if tar_flag[j] != 0:
                    k = j + 1
                    break
            if src[i] != tar[j]:
                n_trans += 1
    n_trans //= 2

    # Main weight computation for Jaro distance
    weight = num_com / lens + num_com / lent + (num_com - n_trans) / num_com
    weight /= 3.0

    # Continue to boost the weight if the strings are similar
    # This is the Winkler portion of Jaro-Winkler distance
    if mode == 'winkler' and weight > boost_threshold:

        # Adjust for having up to the first 4 characters in common
        j = 4 if (minv >= 4) else minv
        i = 0
        while (i < j) and (src[i] == tar[i]):
            i += 1
        weight += i * scaling_factor * (1.0 - weight)

        # Optionally adjust for long strings.

        # After agreeing beginning chars, at least two more must agree and
        # the agreeing characters must be > .5 of remaining characters.
        if (long_strings and (minv > 4) and (num_com > i+1) and
                (2*num_com >= minv+i)):
            weight += (1.0-weight) * ((num_com-i-1) / (lens+lent-i*2+2))

    return weight


def dist_jaro_winkler(src, tar, qval=1, mode='winkler', long_strings=False,
                      boost_threshold=0.7, scaling_factor=0.1):
    """Return the Jaro or Jaro-Winkler distance between two strings.

    Jaro(-Winkler) similarity is the complement of Jaro(-Winkler) distance:
    :math:`sim_{Jaro(-Winkler)} = 1 - dist_{Jaro(-Winkler)}`.

    :param str src: source string for comparison
    :param str tar: target string for comparison
    :param int qval: the length of each q-gram (defaults to 1: character-wise
        matching)
    :param str mode: indicates which variant of this distance metric to
        compute:

            - 'winkler' -- computes the Jaro-Winkler distance (default) which
              increases the score for matches near the start of the word
            - 'jaro' -- computes the Jaro distance

    The following arguments apply only when mode is 'winkler':

    :param bool long_strings: set to True to "Increase the probability of a
        match when the number of matched characters is large.  This option
        allows for a little more tolerance when the strings are large.  It is
        not an appropriate test when comparing fixed length fields such as
        phone and social security numbers."
    :param float boost_threshold: a value between 0 and 1, below which the
        Winkler boost is not applied (defaults to 0.7)
    :param float scaling_factor: a value between 0 and 0.25, indicating by how
        much to boost scores for matching prefixes (defaults to 0.1)

    :returns: Jaro or Jaro-Winkler distance
    :rtype: float

    >>> round(dist_jaro_winkler('cat', 'hat'), 12)
    0.222222222222
    >>> round(dist_jaro_winkler('Niall', 'Neil'), 12)
    0.195
    >>> round(dist_jaro_winkler('aluminum', 'Catalan'), 12)
    0.39880952381
    >>> round(dist_jaro_winkler('ATCG', 'TAGC'), 12)
    0.166666666667

    >>> round(dist_jaro_winkler('cat', 'hat', mode='jaro'), 12)
    0.222222222222
    >>> round(dist_jaro_winkler('Niall', 'Neil', mode='jaro'), 12)
    0.216666666667
    >>> round(dist_jaro_winkler('aluminum', 'Catalan', mode='jaro'), 12)
    0.39880952381
    >>> round(dist_jaro_winkler('ATCG', 'TAGC', mode='jaro'), 12)
    0.166666666667
    """
    return 1 - sim_jaro_winkler(src, tar, qval, mode, long_strings,
                                boost_threshold, scaling_factor)


if __name__ == '__main__':
    import doctest
    doctest.testmod()


1			# -- coding: utf-8 --
2
3			# Copyright 2014-2018 by Christopher C. Little.
4			# This file is part of Abydos.
5			#
6			# Abydos is free software: you can redistribute it and/or modify
7			# it under the terms of the GNU General Public License as published by
8			# the Free Software Foundation, either version 3 of the License, or
9			# (at your option) any later version.
10			#
11			# Abydos is distributed in the hope that it will be useful,
12			# but WITHOUT ANY WARRANTY; without even the implied warranty of
13			# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14			# GNU General Public License for more details.
15			#
16			# You should have received a copy of the GNU General Public License
17			# along with Abydos. If not, see <http://www.gnu.org/licenses/>.
18
19			"""abydos.distance.jaro.
20
21			The distance.jaro module implements distance metrics based on
22			:cite:`Jaro:1989` and subsequent works:
23
24			- Jaro distance
25			- Jaro-Winkler distance
26			- the strcmp95 algorithm variant of Jaro-Winkler distance
27			"""
28
29			from __future__ import division, unicode_literals
30
31			from collections import defaultdict
32
33			from six.moves import range
34
35			from ..tokenizer.qgram import QGrams
36
37
38			__all__ = ['dist_jaro_winkler', 'dist_strcmp95', 'sim_jaro_winkler',
39			'sim_strcmp95']
40
41
42			def sim_strcmp95(src, tar, long_strings=False):
43			"""Return the strcmp95 similarity of two strings.
44
45			This is a Python translation of the C code for strcmp95:
46			http://web.archive.org/web/20110629121242/http://www.census.gov/geo/msb/stand/strcmp.c
47			:cite:`Winkler:1994`.
48			The above file is a US Government publication and, accordingly,
49			in the public domain.
50
51			This is based on the Jaro-Winkler distance, but also attempts to correct
52			for some common typos and frequently confused characters. It is also
53			limited to uppercase ASCII characters, so it is appropriate to American
54			names, but not much else.
55
56			:param str src: source string for comparison
57			:param str tar: target string for comparison
58			:param bool long_strings: set to True to "Increase the probability of a
59			match when the number of matched characters is large. This option
60			allows for a little more tolerance when the strings are large. It is
61			not an appropriate test when comparing fixed length fields such as
62			phone and social security numbers."
63			:returns: strcmp95 similarity
64			:rtype: float
65
66			>>> sim_strcmp95('cat', 'hat')
67			0.7777777777777777
68			>>> sim_strcmp95('Niall', 'Neil')
69			0.8454999999999999
70			>>> sim_strcmp95('aluminum', 'Catalan')
71			0.6547619047619048
72			>>> sim_strcmp95('ATCG', 'TAGC')
73			0.8333333333333334
74			"""
75			def _in_range(char):
76			"""Return True if char is in the range (0, 91)."""
77			return 91 > ord(char) > 0
78
79			ying = src.strip().upper()
80			yang = tar.strip().upper()
81
82			if ying == yang:
83			return 1.0
84			# If either string is blank - return - added in Version 2
85			if not ying or not yang:
86			return 0.0
87
88			adjwt = defaultdict(int)
89			sp_mx = (
90			('A', 'E'), ('A', 'I'), ('A', 'O'), ('A', 'U'), ('B', 'V'), ('E', 'I'),
91			('E', 'O'), ('E', 'U'), ('I', 'O'), ('I', 'U'), ('O', 'U'), ('I', 'Y'),
92			('E', 'Y'), ('C', 'G'), ('E', 'F'), ('W', 'U'), ('W', 'V'), ('X', 'K'),
93			('S', 'Z'), ('X', 'S'), ('Q', 'C'), ('U', 'V'), ('M', 'N'), ('L', 'I'),
94			('Q', 'O'), ('P', 'R'), ('I', 'J'), ('2', 'Z'), ('5', 'S'), ('8', 'B'),
95			('1', 'I'), ('1', 'L'), ('0', 'O'), ('0', 'Q'), ('C', 'K'), ('G', 'J')
96			)
97
98			# Initialize the adjwt array on the first call to the function only.
99			# The adjwt array is used to give partial credit for characters that
100			# may be errors due to known phonetic or character recognition errors.
101			# A typical example is to match the letter "O" with the number "0"
102			for i in sp_mx:
103			adjwt[(i[0], i[1])] = 3
104			adjwt[(i[1], i[0])] = 3
105
106			if len(ying) > len(yang):
107			search_range = len(ying)
108			minv = len(yang)
109			else:
110			search_range = len(yang)
111			minv = len(ying)
112
113			# Blank out the flags
114			ying_flag = [0] * search_range
115			yang_flag = [0] * search_range
116			search_range = max(0, search_range // 2 - 1)
117
118			# Looking only within the search range, count and flag the matched pairs.
119			num_com = 0
120			yl1 = len(yang) - 1
121			for i in range(len(ying)):
122			low_lim = (i - search_range) if (i >= search_range) else 0
123			hi_lim = (i + search_range) if ((i + search_range) <= yl1) else yl1
124			for j in range(low_lim, hi_lim+1):
125			if (yang_flag[j] == 0) and (yang[j] == ying[i]):
126			yang_flag[j] = 1
127			ying_flag[i] = 1
128			num_com += 1
129			break
130
131			# If no characters in common - return
132			if num_com == 0:
133			return 0.0
134
135			# Count the number of transpositions
136			k = n_trans = 0
137			for i in range(len(ying)):
138			if ying_flag[i] != 0:
139			j = 0
140			for j in range(k, len(yang)): # pragma: no branch
141			if yang_flag[j] != 0:
142			k = j + 1
143			break
144			if ying[i] != yang[j]:
145			n_trans += 1
146			n_trans //= 2
147
148			# Adjust for similarities in unmatched characters
149			n_simi = 0
150			if minv > num_com:
151			for i in range(len(ying)):
152			if ying_flag[i] == 0 and _in_range(ying[i]):
153			for j in range(len(yang)):
154			if yang_flag[j] == 0 and _in_range(yang[j]):
155			if (ying[i], yang[j]) in adjwt:
156			n_simi += adjwt[(ying[i], yang[j])]
157			yang_flag[j] = 2
158			break
159			num_sim = n_simi/10.0 + num_com
160
161			# Main weight computation
162			weight = num_sim / len(ying) + num_sim / len(yang) + \
163			(num_com - n_trans) / num_com
164			weight /= 3.0
165
166			# Continue to boost the weight if the strings are similar
167			if weight > 0.7:
168
169			# Adjust for having up to the first 4 characters in common
170			j = 4 if (minv >= 4) else minv
171			i = 0
172			while (i < j) and (ying[i] == yang[i]) and (not ying[i].isdigit()):
173			i += 1
174			if i:
175			weight += i * 0.1 * (1.0 - weight)
176
177			# Optionally adjust for long strings.
178
179			# After agreeing beginning chars, at least two more must agree and
180			# the agreeing characters must be > .5 of remaining characters.
181			if (long_strings and (minv > 4) and (num_com > i+1) and
182			(2*num_com >= minv+i)):
183			if not ying[0].isdigit():
184			weight += (1.0-weight) * ((num_com-i-1) /
185			(len(ying)+len(yang)-i*2+2))
186
187			return weight
188
189
190			def dist_strcmp95(src, tar, long_strings=False):
191			"""Return the strcmp95 distance between two strings.
192
193			strcmp95 distance is the complement of strcmp95 similarity:
194			:math:`dist_{strcmp95} = 1 - sim_{strcmp95}`.
195
196			:param str src: source string for comparison
197			:param str tar: target string for comparison
198			:param bool long_strings: set to True to "Increase the probability of a
199			match when the number of matched characters is large. This option
200			allows for a little more tolerance when the strings are large. It is
201			not an appropriate test when comparing fixed length fields such as
202			phone and social security numbers."
203			:returns: strcmp95 distance
204			:rtype: float
205
206			>>> round(dist_strcmp95('cat', 'hat'), 12)
207			0.222222222222
208			>>> round(dist_strcmp95('Niall', 'Neil'), 12)
209			0.1545
210			>>> round(dist_strcmp95('aluminum', 'Catalan'), 12)
211			0.345238095238
212			>>> round(dist_strcmp95('ATCG', 'TAGC'), 12)
213			0.166666666667
214			"""
215			return 1 - sim_strcmp95(src, tar, long_strings)
216
217
218			def sim_jaro_winkler(src, tar, qval=1, mode='winkler', long_strings=False,
219			boost_threshold=0.7, scaling_factor=0.1):
220			"""Return the Jaro or Jaro-Winkler similarity of two strings.
221
222			Jaro(-Winkler) distance is a string edit distance initially proposed by
223			Jaro and extended by Winkler :cite:`Jaro:1989,Winkler:1990`.
224
225			This is Python based on the C code for strcmp95:
226			http://web.archive.org/web/20110629121242/http://www.census.gov/geo/msb/stand/strcmp.c
227			:cite:`Winkler:1994`. The above file is a US Government publication and,
228			accordingly, in the public domain.
229
230			:param str src: source string for comparison
231			:param str tar: target string for comparison
232			:param int qval: the length of each q-gram (defaults to 1: character-wise
233			matching)
234			:param str mode: indicates which variant of this distance metric to
235			compute:
236
237			- 'winkler' -- computes the Jaro-Winkler distance (default) which
238			increases the score for matches near the start of the word
239			- 'jaro' -- computes the Jaro distance
240
241			The following arguments apply only when mode is 'winkler':
242
243			:param bool long_strings: set to True to "Increase the probability of a
244			match when the number of matched characters is large. This option
245			allows for a little more tolerance when the strings are large. It is
246			not an appropriate test when comparing fixed length fields such as
247			phone and social security numbers."
248			:param float boost_threshold: a value between 0 and 1, below which the
249			Winkler boost is not applied (defaults to 0.7)
250			:param float scaling_factor: a value between 0 and 0.25, indicating by how
251			much to boost scores for matching prefixes (defaults to 0.1)
252
253			:returns: Jaro or Jaro-Winkler similarity
254			:rtype: float
255
256			>>> round(sim_jaro_winkler('cat', 'hat'), 12)
257			0.777777777778
258			>>> round(sim_jaro_winkler('Niall', 'Neil'), 12)
259			0.805
260			>>> round(sim_jaro_winkler('aluminum', 'Catalan'), 12)
261			0.60119047619
262			>>> round(sim_jaro_winkler('ATCG', 'TAGC'), 12)
263			0.833333333333
264
265			>>> round(sim_jaro_winkler('cat', 'hat', mode='jaro'), 12)
266			0.777777777778
267			>>> round(sim_jaro_winkler('Niall', 'Neil', mode='jaro'), 12)
268			0.783333333333
269			>>> round(sim_jaro_winkler('aluminum', 'Catalan', mode='jaro'), 12)
270			0.60119047619
271			>>> round(sim_jaro_winkler('ATCG', 'TAGC', mode='jaro'), 12)
272			0.833333333333
273			"""
274			if mode == 'winkler':
275			if boost_threshold > 1 or boost_threshold < 0:
276			raise ValueError('Unsupported boost_threshold assignment; ' +
277			'boost_threshold must be between 0 and 1.')
278			if scaling_factor > 0.25 or scaling_factor < 0:
279			raise ValueError('Unsupported scaling_factor assignment; ' +
280			'scaling_factor must be between 0 and 0.25.')
281
282			if src == tar:
283			return 1.0
284
285			src = QGrams(src.strip(), qval).ordered_list
286			tar = QGrams(tar.strip(), qval).ordered_list
287
288			lens = len(src)
289			lent = len(tar)
290
291			# If either string is blank - return - added in Version 2
292			if lens == 0 or lent == 0:
293			return 0.0
294
295			if lens > lent:
296			search_range = lens
297			minv = lent
298			else:
299			search_range = lent
300			minv = lens
301
302			# Zero out the flags
303			src_flag = [0] * search_range
304			tar_flag = [0] * search_range
305			search_range = max(0, search_range//2 - 1)
306
307			# Looking only within the search range, count and flag the matched pairs.
308			num_com = 0
309			yl1 = lent - 1
310			for i in range(lens):
311			low_lim = (i - search_range) if (i >= search_range) else 0
312			hi_lim = (i + search_range) if ((i + search_range) <= yl1) else yl1
313			for j in range(low_lim, hi_lim+1):
314			if (tar_flag[j] == 0) and (tar[j] == src[i]):
315			tar_flag[j] = 1
316			src_flag[i] = 1
317			num_com += 1
318			break
319
320			# If no characters in common - return
321			if num_com == 0:
322			return 0.0
323
324			# Count the number of transpositions
325			k = n_trans = 0
326			for i in range(lens):
327			if src_flag[i] != 0:
328			j = 0
329			for j in range(k, lent): # pragma: no branch
330			if tar_flag[j] != 0:
331			k = j + 1
332			break
333			if src[i] != tar[j]:
334			n_trans += 1
335			n_trans //= 2
336
337			# Main weight computation for Jaro distance
338			weight = num_com / lens + num_com / lent + (num_com - n_trans) / num_com
339			weight /= 3.0
340
341			# Continue to boost the weight if the strings are similar
342			# This is the Winkler portion of Jaro-Winkler distance
343			if mode == 'winkler' and weight > boost_threshold:
344
345			# Adjust for having up to the first 4 characters in common
346			j = 4 if (minv >= 4) else minv
347			i = 0
348			while (i < j) and (src[i] == tar[i]):
349			i += 1
350			weight += i * scaling_factor * (1.0 - weight)
351
352			# Optionally adjust for long strings.
353
354			# After agreeing beginning chars, at least two more must agree and
355			# the agreeing characters must be > .5 of remaining characters.
356			if (long_strings and (minv > 4) and (num_com > i+1) and
357			(2*num_com >= minv+i)):
358			weight += (1.0-weight) * ((num_com-i-1) / (lens+lent-i*2+2))
359
360			return weight
361
362
363			def dist_jaro_winkler(src, tar, qval=1, mode='winkler', long_strings=False,
364			boost_threshold=0.7, scaling_factor=0.1):
365			"""Return the Jaro or Jaro-Winkler distance between two strings.
366
367			Jaro(-Winkler) similarity is the complement of Jaro(-Winkler) distance:
368			:math:`sim_{Jaro(-Winkler)} = 1 - dist_{Jaro(-Winkler)}`.
369
370			:param str src: source string for comparison
371			:param str tar: target string for comparison
372			:param int qval: the length of each q-gram (defaults to 1: character-wise
373			matching)
374			:param str mode: indicates which variant of this distance metric to
375			compute:
376
377			- 'winkler' -- computes the Jaro-Winkler distance (default) which
378			increases the score for matches near the start of the word
379			- 'jaro' -- computes the Jaro distance
380
381			The following arguments apply only when mode is 'winkler':
382
383			:param bool long_strings: set to True to "Increase the probability of a
384			match when the number of matched characters is large. This option
385			allows for a little more tolerance when the strings are large. It is
386			not an appropriate test when comparing fixed length fields such as
387			phone and social security numbers."
388			:param float boost_threshold: a value between 0 and 1, below which the
389			Winkler boost is not applied (defaults to 0.7)
390			:param float scaling_factor: a value between 0 and 0.25, indicating by how
391			much to boost scores for matching prefixes (defaults to 0.1)
392
393			:returns: Jaro or Jaro-Winkler distance
394			:rtype: float
395
396			>>> round(dist_jaro_winkler('cat', 'hat'), 12)
397			0.222222222222
398			>>> round(dist_jaro_winkler('Niall', 'Neil'), 12)
399			0.195
400			>>> round(dist_jaro_winkler('aluminum', 'Catalan'), 12)
401			0.39880952381
402			>>> round(dist_jaro_winkler('ATCG', 'TAGC'), 12)
403			0.166666666667
404
405			>>> round(dist_jaro_winkler('cat', 'hat', mode='jaro'), 12)
406			0.222222222222
407			>>> round(dist_jaro_winkler('Niall', 'Neil', mode='jaro'), 12)
408			0.216666666667
409			>>> round(dist_jaro_winkler('aluminum', 'Catalan', mode='jaro'), 12)
410			0.39880952381
411			>>> round(dist_jaro_winkler('ATCG', 'TAGC', mode='jaro'), 12)
412			0.166666666667
413			"""
414			return 1 - sim_jaro_winkler(src, tar, qval, mode, long_strings,
415			boost_threshold, scaling_factor)
416
417
418			if __name__ == '__main__':
419			import doctest
420			doctest.testmod()
421

chrislit / abydos

Pull Request — master (#120)

abydos.distance.jaro.sim_strcmp95() F

Complexity

Size

Duplication

Importance

How to fix Long Method Complexity

Long Method

Complexity

Duplication Side-by-Side

Filter issues like