Completed
Pull Request — master (#138)
by Chris
14:20
created

abydos.distance._jaro.dist_strcmp95()   A

Complexity

Conditions 1

Size

Total Lines 25
Code Lines 2

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 2
CRAP Score 1

Importance

Changes 0
Metric Value
eloc 2
dl 0
loc 25
ccs 2
cts 2
cp 1
rs 10
c 0
b 0
f 0
cc 1
nop 3
crap 1
1
# -*- coding: utf-8 -*-
2
3
# Copyright 2014-2018 by Christopher C. Little.
4
# This file is part of Abydos.
5
#
6
# Abydos is free software: you can redistribute it and/or modify
7
# it under the terms of the GNU General Public License as published by
8
# the Free Software Foundation, either version 3 of the License, or
9
# (at your option) any later version.
10
#
11
# Abydos is distributed in the hope that it will be useful,
12
# but WITHOUT ANY WARRANTY; without even the implied warranty of
13
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14
# GNU General Public License for more details.
15
#
16
# You should have received a copy of the GNU General Public License
17
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.
18
19 1
"""abydos.distance.jaro.
20
21
The distance.jaro module implements distance metrics based on
22
:cite:`Jaro:1989` and subsequent works:
23
24
    - Jaro distance
25
    - Jaro-Winkler distance
26
    - the strcmp95 algorithm variant of Jaro-Winkler distance
27
"""
28
29 1
from __future__ import division, unicode_literals
30
31 1
from collections import defaultdict
32
33 1
from six.moves import range
34
35 1
from ._distance import Distance
36 1
from ..tokenizer import QGrams
37
38 1
__all__ = [
39
    'JaroWinkler',
40
    'Strcmp95',
41
    'dist_jaro_winkler',
42
    'dist_strcmp95',
43
    'sim_jaro_winkler',
44
    'sim_strcmp95',
45
]
46
47
48 1
class Strcmp95(Distance):
    """Strcmp95.

    This is a Python translation of the C code for strcmp95:
    http://web.archive.org/web/20110629121242/http://www.census.gov/geo/msb/stand/strcmp.c
    :cite:`Winkler:1994`.
    The above file is a US Government publication and, accordingly,
    in the public domain.

    This is based on the Jaro-Winkler distance, but also attempts to correct
    for some common typos and frequently confused characters. It is also
    limited to uppercase ASCII characters, so it is appropriate to American
    names, but not much else.
    """

    # Character pairs that earn partial credit when they appear among the
    # otherwise unmatched characters (e.g. the letter "O" and the digit "0").
    _sp_mx = (
        ('A', 'E'),
        ('A', 'I'),
        ('A', 'O'),
        ('A', 'U'),
        ('B', 'V'),
        ('E', 'I'),
        ('E', 'O'),
        ('E', 'U'),
        ('I', 'O'),
        ('I', 'U'),
        ('O', 'U'),
        ('I', 'Y'),
        ('E', 'Y'),
        ('C', 'G'),
        ('E', 'F'),
        ('W', 'U'),
        ('W', 'V'),
        ('X', 'K'),
        ('S', 'Z'),
        ('X', 'S'),
        ('Q', 'C'),
        ('U', 'V'),
        ('M', 'N'),
        ('L', 'I'),
        ('Q', 'O'),
        ('P', 'R'),
        ('I', 'J'),
        ('2', 'Z'),
        ('5', 'S'),
        ('8', 'B'),
        ('1', 'I'),
        ('1', 'L'),
        ('0', 'O'),
        ('0', 'Q'),
        ('C', 'K'),
        ('G', 'J'),
    )

    # Adjustment weights for the pairs above, keyed in both orientations.
    # Built once when the class is created instead of on every sim() call.
    _adjwt = {
        pair: 3
        for first, second in _sp_mx
        for pair in ((first, second), (second, first))
    }

    def sim(self, src, tar, long_strings=False):
        """Return the strcmp95 similarity of two strings.

        Both strings are stripped of surrounding whitespace and uppercased
        before they are compared.

        :param str src: source string for comparison
        :param str tar: target string for comparison
        :param bool long_strings: set to True to "Increase the probability of a
            match when the number of matched characters is large.  This option
            allows for a little more tolerance when the strings are large. It
            is not an appropriate test when comparing fixed length fields such
            as phone and social security numbers."
        :returns: strcmp95 similarity
        :rtype: float

        >>> cmp = Strcmp95()
        >>> cmp.sim('cat', 'hat')
        0.7777777777777777
        >>> cmp.sim('Niall', 'Neil')
        0.8454999999999999
        >>> cmp.sim('aluminum', 'Catalan')
        0.6547619047619048
        >>> cmp.sim('ATCG', 'TAGC')
        0.8333333333333334
        """
        ying = src.strip().upper()
        yang = tar.strip().upper()

        if ying == yang:
            return 1.0
        # If either string is blank - return - added in Version 2
        if not ying or not yang:
            return 0.0

        ying_len = len(ying)
        yang_len = len(yang)
        minv = min(ying_len, yang_len)
        maxv = max(ying_len, yang_len)

        # Flag values: 0 = unmatched, 1 = exactly matched,
        # 2 = (yang only) already used for a partial-credit pairing.
        ying_flag = [0] * maxv
        yang_flag = [0] * maxv
        search_range = max(0, maxv // 2 - 1)

        # Looking only within the search range,
        # count and flag the matched pairs.
        num_com = 0
        last_yang = yang_len - 1
        for i, ying_char in enumerate(ying):
            low_lim = max(0, i - search_range)
            hi_lim = min(i + search_range, last_yang)
            for j in range(low_lim, hi_lim + 1):
                if yang_flag[j] == 0 and yang[j] == ying_char:
                    yang_flag[j] = 1
                    ying_flag[i] = 1
                    num_com += 1
                    break

        # If no characters in common - return
        if num_com == 0:
            return 0.0

        # Count the number of transpositions: pair up the matched characters
        # of each string in order of appearance and count the disagreements.
        # Only 0/1 flags exist at this point, so truthiness selects matches.
        ying_matched = [char for char, flag in zip(ying, ying_flag) if flag]
        yang_matched = [char for char, flag in zip(yang, yang_flag) if flag]
        n_trans = (
            sum(a != b for a, b in zip(ying_matched, yang_matched)) // 2
        )

        # Adjust for similarities in unmatched characters. Only characters
        # with a code point in the open range (0, 91) are considered.
        n_simi = 0
        if minv > num_com:
            adjwt = self._adjwt
            for i, ying_char in enumerate(ying):
                if ying_flag[i] != 0 or not 0 < ord(ying_char) < 91:
                    continue
                for j, yang_char in enumerate(yang):
                    if yang_flag[j] != 0 or not 0 < ord(yang_char) < 91:
                        continue
                    credit = adjwt.get((ying_char, yang_char))
                    if credit:
                        n_simi += credit
                        # Each yang character may give partial credit once.
                        yang_flag[j] = 2
                        break
        num_sim = n_simi / 10.0 + num_com

        # Main weight computation
        weight = (
            num_sim / ying_len
            + num_sim / yang_len
            + (num_com - n_trans) / num_com
        )
        weight /= 3.0

        # Continue to boost the weight if the strings are similar
        if weight > 0.7:

            # Adjust for having up to the first 4 characters in common
            # (a digit ends the agreeing prefix).
            j = min(4, minv)
            i = 0
            while i < j and ying[i] == yang[i] and not ying[i].isdigit():
                i += 1
            if i:
                weight += i * 0.1 * (1.0 - weight)

            # Optionally adjust for long strings.

            # After agreeing beginning chars, at least two more must agree and
            # the agreeing characters must be > .5 of remaining characters.
            if (
                long_strings
                and minv > 4
                and num_com > i + 1
                and 2 * num_com >= minv + i
                and not ying[0].isdigit()
            ):
                weight += (1.0 - weight) * (
                    (num_com - i - 1) / (ying_len + yang_len - i * 2 + 2)
                )

        return weight
240
241
242 1
def sim_strcmp95(src, tar, long_strings=False):
    """Return the strcmp95 similarity of two strings.

    This is a wrapper for :py:meth:`Strcmp95.sim`: it creates a fresh
    :py:class:`Strcmp95` instance and forwards all three arguments to it.

    :param str src: source string for comparison
    :param str tar: target string for comparison
    :param bool long_strings: set to True to "Increase the probability of a
        match when the number of matched characters is large.  This option
        allows for a little more tolerance when the strings are large. It is
        not an appropriate test when comparing fixed length fields such as
        phone and social security numbers."
    :returns: strcmp95 similarity
    :rtype: float

    >>> sim_strcmp95('cat', 'hat')
    0.7777777777777777
    >>> sim_strcmp95('Niall', 'Neil')
    0.8454999999999999
    >>> sim_strcmp95('aluminum', 'Catalan')
    0.6547619047619048
    >>> sim_strcmp95('ATCG', 'TAGC')
    0.8333333333333334
    """
    comparator = Strcmp95()
    return comparator.sim(src, tar, long_strings)
267
268
269 1
def dist_strcmp95(src, tar, long_strings=False):
    """Return the strcmp95 distance between two strings.

    This is a wrapper for :py:meth:`Strcmp95.dist`: it creates a fresh
    :py:class:`Strcmp95` instance and forwards all three arguments to it.

    :param str src: source string for comparison
    :param str tar: target string for comparison
    :param bool long_strings: set to True to "Increase the probability of a
        match when the number of matched characters is large.  This option
        allows for a little more tolerance when the strings are large. It is
        not an appropriate test when comparing fixed length fields such as
        phone and social security numbers."
    :returns: strcmp95 distance
    :rtype: float

    >>> round(dist_strcmp95('cat', 'hat'), 12)
    0.222222222222
    >>> round(dist_strcmp95('Niall', 'Neil'), 12)
    0.1545
    >>> round(dist_strcmp95('aluminum', 'Catalan'), 12)
    0.345238095238
    >>> round(dist_strcmp95('ATCG', 'TAGC'), 12)
    0.166666666667
    """
    comparator = Strcmp95()
    return comparator.dist(src, tar, long_strings)
294
295
296 1
class JaroWinkler(Distance):
    """Jaro-Winkler distance.

    Jaro(-Winkler) distance is a string edit distance initially proposed by
    Jaro and extended by Winkler :cite:`Jaro:1989,Winkler:1990`.

    This is Python based on the C code for strcmp95:
    http://web.archive.org/web/20110629121242/http://www.census.gov/geo/msb/stand/strcmp.c
    :cite:`Winkler:1994`. The above file is a US Government publication and,
    accordingly, in the public domain.
    """

    def sim(
        self,
        src,
        tar,
        qval=1,
        mode='winkler',
        long_strings=False,
        boost_threshold=0.7,
        scaling_factor=0.1,
    ):
        """Return the Jaro or Jaro-Winkler similarity of two strings.

        Identical inputs score 1.0 immediately. Otherwise both strings are
        stripped, split into q-grams, and compared q-gram by q-gram.

        :param str src: source string for comparison
        :param str tar: target string for comparison
        :param int qval: the length of each q-gram (defaults to 1:
            character-wise matching)
        :param str mode: indicates which variant of this distance metric to
            compute:

                - 'winkler' -- computes the Jaro-Winkler distance (default)
                  which increases the score for matches near the start of the
                  word
                - 'jaro' -- computes the Jaro distance

        The following arguments apply only when mode is 'winkler':

        :param bool long_strings: set to True to "Increase the probability of a
            match when the number of matched characters is large.  This option
            allows for a little more tolerance when the strings are large.  It
            is not an appropriate test when comparing fixed length fields such
            as phone and social security numbers."
        :param float boost_threshold: a value between 0 and 1, below which the
            Winkler boost is not applied (defaults to 0.7)
        :param float scaling_factor: a value between 0 and 0.25, indicating by
            how much to boost scores for matching prefixes (defaults to 0.1)

        :returns: Jaro or Jaro-Winkler similarity
        :rtype: float
        :raises ValueError: if mode is 'winkler' and boost_threshold or
            scaling_factor lies outside its permitted interval

        >>> cmp = JaroWinkler()
        >>> round(cmp.sim('cat', 'hat'), 12)
        0.777777777778
        >>> round(cmp.sim('Niall', 'Neil'), 12)
        0.805
        >>> round(cmp.sim('aluminum', 'Catalan'), 12)
        0.60119047619
        >>> round(cmp.sim('ATCG', 'TAGC'), 12)
        0.833333333333

        >>> round(cmp.sim('cat', 'hat', mode='jaro'), 12)
        0.777777777778
        >>> round(cmp.sim('Niall', 'Neil', mode='jaro'), 12)
        0.783333333333
        >>> round(cmp.sim('aluminum', 'Catalan', mode='jaro'), 12)
        0.60119047619
        >>> round(cmp.sim('ATCG', 'TAGC', mode='jaro'), 12)
        0.833333333333
        """
        use_winkler = mode == 'winkler'

        # The tuning parameters are only validated when they will be used.
        if use_winkler:
            if boost_threshold < 0 or boost_threshold > 1:
                raise ValueError(
                    'Unsupported boost_threshold assignment; '
                    'boost_threshold must be between 0 and 1.'
                )
            if scaling_factor < 0 or scaling_factor > 0.25:
                raise ValueError(
                    'Unsupported scaling_factor assignment; '
                    'scaling_factor must be between 0 and 0.25.'
                )

        if src == tar:
            return 1.0

        src_grams = QGrams(src.strip(), qval).ordered_list
        tar_grams = QGrams(tar.strip(), qval).ordered_list

        lens = len(src_grams)
        lent = len(tar_grams)

        # If either string is blank - return - added in Version 2
        if not lens or not lent:
            return 0.0

        minv = min(lens, lent)
        longest = max(lens, lent)

        # Zero out the flags (0 = unmatched, 1 = matched)
        src_flag = [0] * longest
        tar_flag = [0] * longest
        search_range = max(0, longest // 2 - 1)

        # Looking only within the search range,
        # count and flag the matched pairs.
        num_com = 0
        last_tar = lent - 1
        for i, gram in enumerate(src_grams):
            low_lim = max(0, i - search_range)
            hi_lim = min(i + search_range, last_tar)
            for j in range(low_lim, hi_lim + 1):
                if tar_flag[j] == 0 and tar_grams[j] == gram:
                    tar_flag[j] = 1
                    src_flag[i] = 1
                    num_com += 1
                    break

        # If no characters in common - return
        if num_com == 0:
            return 0.0

        # Count the number of transpositions: line up the matched q-grams of
        # each side in order of appearance and count the disagreeing pairs.
        src_matched = [g for g, flag in zip(src_grams, src_flag) if flag]
        tar_matched = [g for g, flag in zip(tar_grams, tar_flag) if flag]
        n_trans = sum(a != b for a, b in zip(src_matched, tar_matched)) // 2

        # Main weight computation for Jaro distance
        weight = (
            num_com / lens + num_com / lent + (num_com - n_trans) / num_com
        )
        weight /= 3.0

        # Plain Jaro, or a score too low to boost: nothing more to do.
        if not (use_winkler and weight > boost_threshold):
            return weight

        # This is the Winkler portion of Jaro-Winkler distance:
        # adjust for having up to the first 4 characters in common
        prefix_cap = min(4, minv)
        i = 0
        while i < prefix_cap and src_grams[i] == tar_grams[i]:
            i += 1
        weight += i * scaling_factor * (1.0 - weight)

        # Optionally adjust for long strings.

        # After agreeing beginning chars, at least two more must agree and
        # the agreeing characters must be > .5 of remaining characters.
        if (
            long_strings
            and minv > 4
            and num_com > i + 1
            and 2 * num_com >= minv + i
        ):
            weight += (1.0 - weight) * (
                (num_com - i - 1) / (lens + lent - i * 2 + 2)
            )

        return weight
465
466
467 1
def sim_jaro_winkler(
    src,
    tar,
    qval=1,
    mode='winkler',
    long_strings=False,
    boost_threshold=0.7,
    scaling_factor=0.1,
):
    """Return the Jaro or Jaro-Winkler similarity of two strings.

    This is a wrapper for :py:meth:`JaroWinkler.sim`: it creates a fresh
    :py:class:`JaroWinkler` instance and forwards every argument to it.

    :param str src: source string for comparison
    :param str tar: target string for comparison
    :param int qval: the length of each q-gram (defaults to 1: character-wise
        matching)
    :param str mode: indicates which variant of this distance metric to
        compute:

            - 'winkler' -- computes the Jaro-Winkler distance (default) which
              increases the score for matches near the start of the word
            - 'jaro' -- computes the Jaro distance

    The following arguments apply only when mode is 'winkler':

    :param bool long_strings: set to True to "Increase the probability of a
        match when the number of matched characters is large.  This option
        allows for a little more tolerance when the strings are large.  It is
        not an appropriate test when comparing fixed length fields such as
        phone and social security numbers."
    :param float boost_threshold: a value between 0 and 1, below which the
        Winkler boost is not applied (defaults to 0.7)
    :param float scaling_factor: a value between 0 and 0.25, indicating by how
        much to boost scores for matching prefixes (defaults to 0.1)

    :returns: Jaro or Jaro-Winkler similarity
    :rtype: float

    >>> round(sim_jaro_winkler('cat', 'hat'), 12)
    0.777777777778
    >>> round(sim_jaro_winkler('Niall', 'Neil'), 12)
    0.805
    >>> round(sim_jaro_winkler('aluminum', 'Catalan'), 12)
    0.60119047619
    >>> round(sim_jaro_winkler('ATCG', 'TAGC'), 12)
    0.833333333333

    >>> round(sim_jaro_winkler('cat', 'hat', mode='jaro'), 12)
    0.777777777778
    >>> round(sim_jaro_winkler('Niall', 'Neil', mode='jaro'), 12)
    0.783333333333
    >>> round(sim_jaro_winkler('aluminum', 'Catalan', mode='jaro'), 12)
    0.60119047619
    >>> round(sim_jaro_winkler('ATCG', 'TAGC', mode='jaro'), 12)
    0.833333333333
    """
    comparator = JaroWinkler()
    return comparator.sim(
        src, tar, qval, mode, long_strings, boost_threshold, scaling_factor
    )
527
528
529 1
def dist_jaro_winkler(
    src,
    tar,
    qval=1,
    mode='winkler',
    long_strings=False,
    boost_threshold=0.7,
    scaling_factor=0.1,
):
    """Return the Jaro or Jaro-Winkler distance between two strings.

    This is a wrapper for :py:meth:`JaroWinkler.dist`: it creates a fresh
    :py:class:`JaroWinkler` instance and forwards every argument to it.

    :param str src: source string for comparison
    :param str tar: target string for comparison
    :param int qval: the length of each q-gram (defaults to 1: character-wise
        matching)
    :param str mode: indicates which variant of this distance metric to
        compute:

            - 'winkler' -- computes the Jaro-Winkler distance (default) which
              increases the score for matches near the start of the word
            - 'jaro' -- computes the Jaro distance

    The following arguments apply only when mode is 'winkler':

    :param bool long_strings: set to True to "Increase the probability of a
        match when the number of matched characters is large.  This option
        allows for a little more tolerance when the strings are large.  It is
        not an appropriate test when comparing fixed length fields such as
        phone and social security numbers."
    :param float boost_threshold: a value between 0 and 1, below which the
        Winkler boost is not applied (defaults to 0.7)
    :param float scaling_factor: a value between 0 and 0.25, indicating by how
        much to boost scores for matching prefixes (defaults to 0.1)

    :returns: Jaro or Jaro-Winkler distance
    :rtype: float

    >>> round(dist_jaro_winkler('cat', 'hat'), 12)
    0.222222222222
    >>> round(dist_jaro_winkler('Niall', 'Neil'), 12)
    0.195
    >>> round(dist_jaro_winkler('aluminum', 'Catalan'), 12)
    0.39880952381
    >>> round(dist_jaro_winkler('ATCG', 'TAGC'), 12)
    0.166666666667

    >>> round(dist_jaro_winkler('cat', 'hat', mode='jaro'), 12)
    0.222222222222
    >>> round(dist_jaro_winkler('Niall', 'Neil', mode='jaro'), 12)
    0.216666666667
    >>> round(dist_jaro_winkler('aluminum', 'Catalan', mode='jaro'), 12)
    0.39880952381
    >>> round(dist_jaro_winkler('ATCG', 'TAGC', mode='jaro'), 12)
    0.166666666667
    """
    comparator = JaroWinkler()
    return comparator.dist(
        src, tar, qval, mode, long_strings, boost_threshold, scaling_factor
    )
589
590
591
if __name__ == '__main__':
    # When the module is executed as a script, run the doctest examples
    # embedded in the docstrings of this module.
    import doctest

    doctest.testmod()
595