Completed
Push — master ( f43547...71985b )
by Chris
12:00 queued 10s
created

abydos.distance._strcmp95.dist_strcmp95()   A

Complexity

Conditions 1

Size

Total Lines 36
Code Lines 2

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 2
CRAP Score 1

Importance

Changes 0
Metric Value
cc 1
eloc 2
nop 3
dl 0
loc 36
ccs 2
cts 2
cp 1
crap 1
rs 10
c 0
b 0
f 0
1
# -*- coding: utf-8 -*-
2
3
# Copyright 2014-2018 by Christopher C. Little.
4
# This file is part of Abydos.
5
#
6
# Abydos is free software: you can redistribute it and/or modify
7
# it under the terms of the GNU General Public License as published by
8
# the Free Software Foundation, either version 3 of the License, or
9
# (at your option) any later version.
10
#
11
# Abydos is distributed in the hope that it will be useful,
12
# but WITHOUT ANY WARRANTY; without even the implied warranty of
13
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14
# GNU General Public License for more details.
15
#
16
# You should have received a copy of the GNU General Public License
17
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.
18
19 1
"""abydos.distance._strcmp95.
20
21
The strcmp95 algorithm variant of Jaro-Winkler distance
22
"""
23
24 1
from __future__ import (
25
    absolute_import,
26
    division,
27
    print_function,
28
    unicode_literals,
29
)
30
31 1
from collections import defaultdict
32
33 1
from six.moves import range
34
35 1
from ._distance import _Distance
36
37 1
__all__ = ['Strcmp95', 'dist_strcmp95', 'sim_strcmp95']
38
39
40 1
class Strcmp95(_Distance):
0 ignored issues
show
Unused Code introduced by
The variable __class__ seems to be unused.
Loading history...
41
    """Strcmp95.
42
43
    This is a Python translation of the C code for strcmp95:
44
    http://web.archive.org/web/20110629121242/http://www.census.gov/geo/msb/stand/strcmp.c
45
    :cite:`Winkler:1994`.
46
    The above file is a US Government publication and, accordingly,
47
    in the public domain.
48
49
    This is based on the Jaro-Winkler distance, but also attempts to correct
50
    for some common typos and frequently confused characters. It is also
51
    limited to uppercase ASCII characters, so it is appropriate to American
52
    names, but not much else.
53
    """
54
55 1
    _sp_mx = (
56
        ('A', 'E'),
57
        ('A', 'I'),
58
        ('A', 'O'),
59
        ('A', 'U'),
60
        ('B', 'V'),
61
        ('E', 'I'),
62
        ('E', 'O'),
63
        ('E', 'U'),
64
        ('I', 'O'),
65
        ('I', 'U'),
66
        ('O', 'U'),
67
        ('I', 'Y'),
68
        ('E', 'Y'),
69
        ('C', 'G'),
70
        ('E', 'F'),
71
        ('W', 'U'),
72
        ('W', 'V'),
73
        ('X', 'K'),
74
        ('S', 'Z'),
75
        ('X', 'S'),
76
        ('Q', 'C'),
77
        ('U', 'V'),
78
        ('M', 'N'),
79
        ('L', 'I'),
80
        ('Q', 'O'),
81
        ('P', 'R'),
82
        ('I', 'J'),
83
        ('2', 'Z'),
84
        ('5', 'S'),
85
        ('8', 'B'),
86
        ('1', 'I'),
87
        ('1', 'L'),
88
        ('0', 'O'),
89
        ('0', 'Q'),
90
        ('C', 'K'),
91
        ('G', 'J'),
92
    )
93
94 1
    def sim(self, src, tar, long_strings=False):
0 ignored issues
show
Comprehensibility introduced by
This function exceeds the maximum number of variables (24/15).
Loading history...
Bug introduced by
Parameters differ from overridden 'sim' method
Loading history...
95
        """Return the strcmp95 similarity of two strings.
96
97
        Parameters
98
        ----------
99
        src : str
100
            Source string for comparison
101
        tar : str
102
            Target string for comparison
103
        long_strings : bool
104
            Set to True to increase the probability of a match when the number
105
            of matched characters is large. This option allows for a little
106
            more tolerance when the strings are large. It is not an appropriate
107
            test when comparing fixed length fields such as phone and social
108
            security numbers.
109
110
        Returns
111
        -------
112
        float
113
            Strcmp95 similarity
114
115
        Examples
116
        --------
117
        >>> cmp = Strcmp95()
118
        >>> cmp.sim('cat', 'hat')
119
        0.7777777777777777
120
        >>> cmp.sim('Niall', 'Neil')
121
        0.8454999999999999
122
        >>> cmp.sim('aluminum', 'Catalan')
123
        0.6547619047619048
124
        >>> cmp.sim('ATCG', 'TAGC')
125
        0.8333333333333334
126
127
        """
128
129 1
        def _in_range(char):
130
            """Return True if char is in the range (0, 91).
131
132
            Parameters
133
            ----------
134
            char : str
135
                The character to check
136
137
            Returns
138
            -------
139
            bool
140
                True if char is in the range (0, 91)
141
142
            """
143 1
            return 91 > ord(char) > 0
144
145 1
        ying = src.strip().upper()
146 1
        yang = tar.strip().upper()
147
148 1
        if ying == yang:
149 1
            return 1.0
150
        # If either string is blank - return - added in Version 2
151 1
        if not ying or not yang:
152 1
            return 0.0
153
154 1
        adjwt = defaultdict(int)
155
156
        # Initialize the adjwt array on the first call to the function only.
157
        # The adjwt array is used to give partial credit for characters that
158
        # may be errors due to known phonetic or character recognition errors.
159
        # A typical example is to match the letter "O" with the number "0"
160 1
        for i in self._sp_mx:
161 1
            adjwt[(i[0], i[1])] = 3
162 1
            adjwt[(i[1], i[0])] = 3
163
164 1
        if len(ying) > len(yang):
165 1
            search_range = len(ying)
166 1
            minv = len(yang)
167
        else:
168 1
            search_range = len(yang)
169 1
            minv = len(ying)
170
171
        # Blank out the flags
172 1
        ying_flag = [0] * search_range
173 1
        yang_flag = [0] * search_range
174 1
        search_range = max(0, search_range // 2 - 1)
175
176
        # Looking only within the search range,
177
        # count and flag the matched pairs.
178 1
        num_com = 0
179 1
        yl1 = len(yang) - 1
180 1
        for i in range(len(ying)):
0 ignored issues
show
unused-code introduced by
Consider using enumerate instead of iterating with range and len
Loading history...
181 1
            low_lim = (i - search_range) if (i >= search_range) else 0
182 1
            hi_lim = (i + search_range) if ((i + search_range) <= yl1) else yl1
183 1
            for j in range(low_lim, hi_lim + 1):
184 1
                if (yang_flag[j] == 0) and (yang[j] == ying[i]):
185 1
                    yang_flag[j] = 1
186 1
                    ying_flag[i] = 1
187 1
                    num_com += 1
188 1
                    break
189
190
        # If no characters in common - return
191 1
        if num_com == 0:
192 1
            return 0.0
193
194
        # Count the number of transpositions
195 1
        k = n_trans = 0
196 1
        for i in range(len(ying)):
0 ignored issues
show
unused-code introduced by
Consider using enumerate instead of iterating with range and len
Loading history...
197 1
            if ying_flag[i] != 0:
198 1
                j = 0
199 1
                for j in range(k, len(yang)):  # pragma: no branch
200 1
                    if yang_flag[j] != 0:
201 1
                        k = j + 1
202 1
                        break
203 1
                if ying[i] != yang[j]:
204 1
                    n_trans += 1
205 1
        n_trans //= 2
206
207
        # Adjust for similarities in unmatched characters
208 1
        n_simi = 0
209 1
        if minv > num_com:
0 ignored issues
show
unused-code introduced by
Too many nested blocks (6/5)
Loading history...
210 1
            for i in range(len(ying)):
0 ignored issues
show
unused-code introduced by
Consider using enumerate instead of iterating with range and len
Loading history...
211 1
                if ying_flag[i] == 0 and _in_range(ying[i]):
212 1
                    for j in range(len(yang)):
0 ignored issues
show
unused-code introduced by
Consider using enumerate instead of iterating with range and len
Loading history...
213 1
                        if yang_flag[j] == 0 and _in_range(yang[j]):
214 1
                            if (ying[i], yang[j]) in adjwt:
215 1
                                n_simi += adjwt[(ying[i], yang[j])]
216 1
                                yang_flag[j] = 2
217 1
                                break
218 1
        num_sim = n_simi / 10.0 + num_com
219
220
        # Main weight computation
221 1
        weight = (
222
            num_sim / len(ying)
223
            + num_sim / len(yang)
224
            + (num_com - n_trans) / num_com
225
        )
226 1
        weight /= 3.0
227
228
        # Continue to boost the weight if the strings are similar
229 1
        if weight > 0.7:
230
231
            # Adjust for having up to the first 4 characters in common
232 1
            j = 4 if (minv >= 4) else minv
233 1
            i = 0
234 1
            while (i < j) and (ying[i] == yang[i]) and (not ying[i].isdigit()):
235 1
                i += 1
236 1
            if i:
237 1
                weight += i * 0.1 * (1.0 - weight)
238
239
            # Optionally adjust for long strings.
240
241
            # After agreeing beginning chars, at least two more must agree and
242
            # the agreeing characters must be > .5 of remaining characters.
243 1
            if (
244
                long_strings
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
245
                and (minv > 4)
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
246
                and (num_com > i + 1)
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
247
                and (2 * num_com >= minv + i)
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
248
            ):
249 1
                if not ying[0].isdigit():
250 1
                    weight += (1.0 - weight) * (
251
                        (num_com - i - 1) / (len(ying) + len(yang) - i * 2 + 2)
252
                    )
253
254 1
        return weight
255
256
257 1
def sim_strcmp95(src, tar, long_strings=False):
258
    """Return the strcmp95 similarity of two strings.
259
260
    This is a wrapper for :py:meth:`Strcmp95.sim`.
261
262
    Parameters
263
    ----------
264
    src : str
265
        Source string for comparison
266
    tar : str
267
        Target string for comparison
268
    long_strings : bool
269
        Set to True to increase the probability of a match when the number of
270
        matched characters is large. This option allows for a little more
271
        tolerance when the strings are large. It is not an appropriate test
272
        when comparing fixed length fields such as phone and social security
273
        numbers.
274
275
    Returns
276
    -------
277
    float
278
        Strcmp95 similarity
279
280
    Examples
281
    --------
282
    >>> sim_strcmp95('cat', 'hat')
283
    0.7777777777777777
284
    >>> sim_strcmp95('Niall', 'Neil')
285
    0.8454999999999999
286
    >>> sim_strcmp95('aluminum', 'Catalan')
287
    0.6547619047619048
288
    >>> sim_strcmp95('ATCG', 'TAGC')
289
    0.8333333333333334
290
291
    """
292 1
    return Strcmp95().sim(src, tar, long_strings)
293
294
295 1
def dist_strcmp95(src, tar, long_strings=False):
296
    """Return the strcmp95 distance between two strings.
297
298
    This is a wrapper for :py:meth:`Strcmp95.dist`.
299
300
    Parameters
301
    ----------
302
    src : str
303
        Source string for comparison
304
    tar : str
305
        Target string for comparison
306
    long_strings : bool
307
        Set to True to increase the probability of a match when the number of
308
        matched characters is large. This option allows for a little more
309
        tolerance when the strings are large. It is not an appropriate test
310
        when comparing fixed length fields such as phone and social security
311
        numbers.
312
313
    Returns
314
    -------
315
    float
316
        Strcmp95 distance
317
318
    Examples
319
    --------
320
    >>> round(dist_strcmp95('cat', 'hat'), 12)
321
    0.222222222222
322
    >>> round(dist_strcmp95('Niall', 'Neil'), 12)
323
    0.1545
324
    >>> round(dist_strcmp95('aluminum', 'Catalan'), 12)
325
    0.345238095238
326
    >>> round(dist_strcmp95('ATCG', 'TAGC'), 12)
327
    0.166666666667
328
329
    """
330 1
    return Strcmp95().dist(src, tar, long_strings)
331
332
333
if __name__ == '__main__':
334
    import doctest
335
336
    doctest.testmod()
337