abydos.distance._strcmp95.Strcmp95.__init__()   A
last analyzed

Complexity

Conditions 1

Size

Total Lines 20
Code Lines 3

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 1
CRAP Score 1

Importance

Changes 0
Metric Value
eloc 3
dl 0
loc 20
ccs 1
cts 1
cp 1
rs 10
c 0
b 0
f 0
cc 1
nop 3
crap 1
1
# Copyright 2014-2020 by Christopher C. Little.
2
# This file is part of Abydos.
3
#
4
# Abydos is free software: you can redistribute it and/or modify
5
# it under the terms of the GNU General Public License as published by
6
# the Free Software Foundation, either version 3 of the License, or
7
# (at your option) any later version.
8
#
9
# Abydos is distributed in the hope that it will be useful,
10
# but WITHOUT ANY WARRANTY; without even the implied warranty of
11
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12
# GNU General Public License for more details.
13
#
14
# You should have received a copy of the GNU General Public License
15
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.
16
17
"""abydos.distance._strcmp95.
18
19 1
The strcmp95 algorithm variant of Jaro-Winkler distance
20
"""
21
22
from collections import defaultdict
23
from typing import Any, DefaultDict, Tuple
24 1
25
from ._distance import _Distance
26
27
__all__ = ['Strcmp95']
28
29
30
class Strcmp95(_Distance):
31 1
    """Strcmp95.
32
33 1
    This is a Python translation of the C code for strcmp95:
34
    http://web.archive.org/web/20110629121242/http://www.census.gov/geo/msb/stand/strcmp.c
35 1
    :cite:`Winkler:1994`.
36
    The above file is a US Government publication and, accordingly,
37 1
    in the public domain.
38 1
39
    This is based on the Jaro-Winkler distance, but also attempts to correct
40 1
    for some common typos and frequently confused characters. It is also
41
    limited to uppercase ASCII characters, so it is appropriate to American
42
    names, but not much else.
43 1
44
    .. versionadded:: 0.3.6
45
    """
46
47
    _sp_mx = (
48
        ('A', 'E'),
49
        ('A', 'I'),
50
        ('A', 'O'),
51
        ('A', 'U'),
52
        ('B', 'V'),
53
        ('E', 'I'),
54
        ('E', 'O'),
55
        ('E', 'U'),
56
        ('I', 'O'),
57
        ('I', 'U'),
58
        ('O', 'U'),
59
        ('I', 'Y'),
60 1
        ('E', 'Y'),
61
        ('C', 'G'),
62
        ('E', 'F'),
63
        ('W', 'U'),
64
        ('W', 'V'),
65
        ('X', 'K'),
66
        ('S', 'Z'),
67
        ('X', 'S'),
68
        ('Q', 'C'),
69
        ('U', 'V'),
70
        ('M', 'N'),
71
        ('L', 'I'),
72
        ('Q', 'O'),
73
        ('P', 'R'),
74
        ('I', 'J'),
75
        ('2', 'Z'),
76
        ('5', 'S'),
77
        ('8', 'B'),
78
        ('1', 'I'),
79
        ('1', 'L'),
80
        ('0', 'O'),
81
        ('0', 'Q'),
82
        ('C', 'K'),
83
        ('G', 'J'),
84
    )
85
86
    def __init__(self, long_strings: bool = False, **kwargs: Any) -> None:
87
        """Initialize Strcmp95 instance.
88
89
        Parameters
90
        ----------
91
        long_strings : bool
92
            Set to True to increase the probability of a match when the number
93
            of matched characters is large. This option allows for a little
94
            more tolerance when the strings are large. It is not an appropriate
95
            test when comparing fixed length fields such as phone and social
96
            security numbers.
97
        **kwargs
98
            Arbitrary keyword arguments
99 1
100
101
        .. versionadded:: 0.4.0
102
103
        """
104
        super(Strcmp95, self).__init__(**kwargs)
105
        self._long_strings = long_strings
106
107
    def sim(self, src: str, tar: str) -> float:
108
        """Return the strcmp95 similarity of two strings.
109
110
        Parameters
111
        ----------
112
        src : str
113
            Source string for comparison
114
        tar : str
115
            Target string for comparison
116
117 1
        Returns
118 1
        -------
119
        float
120 1
            Strcmp95 similarity
121
122
        Examples
123
        --------
124
        >>> cmp = Strcmp95()
125
        >>> cmp.sim('cat', 'hat')
126
        0.7777777777777777
127
        >>> cmp.sim('Niall', 'Neil')
128
        0.8454999999999999
129
        >>> cmp.sim('aluminum', 'Catalan')
130
        0.6547619047619048
131
        >>> cmp.sim('ATCG', 'TAGC')
132
        0.8333333333333334
133
134
135
        .. versionadded:: 0.1.0
136
        .. versionchanged:: 0.3.6
137
            Encapsulated in class
138
139
        """
140
141
        def _in_range(char: str) -> bool:
142
            """Return True if char is in the range (0, 91).
143
144
            Parameters
145
            ----------
146
            char : str
147
                The character to check
148
149
            Returns
150
            -------
151
            bool
152
                True if char is in the range (0, 91)
153
154 1
            .. versionadded:: 0.1.0
155
156
            """
157
            return 91 > ord(char) > 0
158
159
        ying = src.strip().upper()
160
        yang = tar.strip().upper()
161
162
        if ying == yang:
163
            return 1.0
164
        # If either string is blank - return - added in Version 2
165
        if not ying or not yang:
166
            return 0.0
167
168
        adjwt = defaultdict(int)  # type: DefaultDict[Tuple[str, str], int]
169
170 1
        # Initialize the adjwt array on the first call to the function only.
171
        # The adjwt array is used to give partial credit for characters that
172 1
        # may be errors due to known phonetic or character recognition errors.
173 1
        # A typical example is to match the letter "O" with the number "0"
174
        for tup in self._sp_mx:
175 1
            adjwt[(tup[0], tup[1])] = 3
176 1
            adjwt[(tup[1], tup[0])] = 3
177
178 1
        if len(ying) > len(yang):
179 1
            search_range = len(ying)
180
            minv = len(yang)
181 1
        else:
182
            search_range = len(yang)
183
            minv = len(ying)
184
185
        # Blank out the flags
186
        ying_flag = [0] * search_range
187 1
        yang_flag = [0] * search_range
188 1
        search_range = max(0, search_range // 2 - 1)
189 1
190
        # Looking only within the search range,
191 1
        # count and flag the matched pairs.
192 1
        num_com = 0
193 1
        yl1 = len(yang) - 1
194
        for i in range(len(ying)):
195 1
            low_lim = (i - search_range) if (i >= search_range) else 0
196 1
            hi_lim = (i + search_range) if ((i + search_range) <= yl1) else yl1
197
            for j in range(low_lim, hi_lim + 1):
198
                if (yang_flag[j] == 0) and (yang[j] == ying[i]):
199 1
                    yang_flag[j] = 1
200 1
                    ying_flag[i] = 1
201 1
                    num_com += 1
202
                    break
203
204
        # If no characters in common - return
205 1
        if num_com == 0:
206 1
            return 0.0
207 1
208 1
        # Count the number of transpositions
209 1
        k = n_trans = 0
210 1
        for i in range(len(ying)):
211 1
            if ying_flag[i] != 0:
212 1
                j = 0
213 1
                for j in range(k, len(yang)):  # pragma: no branch
214 1
                    if yang_flag[j] != 0:
215 1
                        k = j + 1
216
                        break
217
                if ying[i] != yang[j]:
218 1
                    n_trans += 1
219 1
        n_trans //= 2
220
221
        # Adjust for similarities in unmatched characters
222 1
        n_simi = 0
223 1
        if minv > num_com:
224 1
            for i in range(len(ying)):
225 1
                if ying_flag[i] == 0 and _in_range(ying[i]):
226 1
                    for j in range(len(yang)):
227 1
                        if yang_flag[j] == 0 and _in_range(yang[j]):
228 1
                            if (ying[i], yang[j]) in adjwt:
229 1
                                n_simi += adjwt[(ying[i], yang[j])]
230 1
                                yang_flag[j] = 2
231 1
                                break
232 1
        num_sim = n_simi / 10.0 + num_com
233
234
        # Main weight computation
235 1
        weight = (
236 1
            num_sim / len(ying)
237 1
            + num_sim / len(yang)
238 1
            + (num_com - n_trans) / num_com
239 1
        )
240 1
        weight /= 3.0
241 1
242 1
        # Continue to boost the weight if the strings are similar
243 1
        if weight > 0.7:
244 1
245 1
            # Adjust for having up to the first 4 characters in common
246
            j = 4 if (minv >= 4) else minv
247
            i = 0
248 1
            while (i < j) and (ying[i] == yang[i]) and (not ying[i].isdigit()):
249
                i += 1
250
            if i:
251
                weight += i * 0.1 * (1.0 - weight)
252
253 1
            # Optionally adjust for long strings.
254
255
            # After agreeing beginning chars, at least two more must agree and
256 1
            # the agreeing characters must be > .5 of remaining characters.
257
            if (
258
                self._long_strings
259 1
                and (minv > 4)
260 1
                and (num_com > i + 1)
261 1
                and (2 * num_com >= minv + i)
262 1
            ):
263 1
                if not ying[0].isdigit():
264 1
                    weight += (1.0 - weight) * (
265
                        (num_com - i - 1) / (len(ying) + len(yang) - i * 2 + 2)
266
                    )
267
268
        return weight
269
270 1
271
if __name__ == '__main__':
272
    import doctest
273
274
    doctest.testmod()
275