abydos.distance._guth.Guth._token_at()

Rating: A

Complexity
  Conditions: 3

Size
  Total Lines: 26
  Code Lines: 7

Duplication
  Lines: 0
  Ratio: 0 %

Code Coverage
  Tests: 5
  CRAP Score: 3

Importance
  Changes: 0
Metric | Value
-------|------
eloc   | 7
dl     | 0
loc    | 26
ccs    | 5
cts    | 5
cp     | 1
rs     | 10
c      | 0
b      | 0
f      | 0
cc     | 3
nop    | 3
crap   | 3
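(For reference: these figures are consistent with the usual C.R.A.P. score
definition, CRAP(m) = comp(m)^2 * (1 - cov(m))^3 + comp(m), where comp is
cyclomatic complexity and cov is test coverage as a fraction. With cc = 3 and
full coverage, cp = 1, that yields 3^2 * 0 + 3 = 3, the crap value reported
above.)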
# Copyright 2019-2020 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.

"""abydos.distance._guth.

Guth matching algorithm
"""

from typing import Any, List, Optional, Union

from ._distance import _Distance
from ..tokenizer import QGrams, _Tokenizer

__all__ = ['Guth']


class Guth(_Distance):
    r"""Guth matching.

    Guth matching :cite:`Guth:1976` uses a simple positional matching rule
    list to determine whether two names match. Following the original, the
    :meth:`.sim_score` method returns only 1.0 for matching or 0.0 for
    non-matching.

    The :meth:`.sim` method instead penalizes more distant matches and never
    outright declares two names non-matching unless no matches can be made
    between the two strings.

    Tokens other than single characters can be matched by specifying a
    tokenizer during initialization or setting the qval parameter.

    .. versionadded:: 0.4.1
    """

    def __init__(
        self, tokenizer: Optional[_Tokenizer] = None, **kwargs: Any
    ) -> None:
        """Initialize Guth instance.

        Parameters
        ----------
        tokenizer : _Tokenizer
            A tokenizer instance from the :py:mod:`abydos.tokenizer` package
        **kwargs
            Arbitrary keyword arguments

        Other Parameters
        ----------------
        qval : int
            The length of each q-gram. Using this parameter and tokenizer=None
            will cause the instance to use the QGrams tokenizer with this
            q value.


        .. versionadded:: 0.4.1

        """
        super(Guth, self).__init__(**kwargs)

        self.params['tokenizer'] = tokenizer
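        # A qval keyword (with no explicit tokenizer) selects a QGrams
        # tokenizer with that q value, as documented above.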
        if 'qval' in self.params:
            self.params['tokenizer'] = QGrams(
                qval=self.params['qval'], start_stop='$#', skip=0, scaler=None
            )

    def _token_at(
        self, name: Union[List[str], str], pos: int
    ) -> Optional[str]:
        """Return the token of name at position pos.

        Parameters
        ----------
        name : str or list
            A string (or list) from which to return a token
        pos : int
            The position of the token to return

        Returns
        -------
        str
            The requested token or None if the position is invalid


        .. versionadded:: 0.4.1

        """
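        # Reject negative positions explicitly; without this guard, Python's
        # negative indexing would silently return a token from the end of
        # the name.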
        if pos < 0:
            return None
        if pos >= len(name):
            return None
        return name[pos]

    def sim_score(self, src: str, tar: str) -> float:
        """Return the Guth matching score of two strings.

        Parameters
        ----------
        src : str
            Source string for comparison
        tar : str
            Target string for comparison

        Returns
        -------
        float
            Guth matching score (1.0 if matching, otherwise 0.0)

        Examples
        --------
        >>> cmp = Guth()
        >>> cmp.sim_score('cat', 'hat')
        1.0
        >>> cmp.sim_score('Niall', 'Neil')
        1.0
        >>> cmp.sim_score('aluminum', 'Catalan')
        0.0
        >>> cmp.sim_score('ATCG', 'TAGC')
        1.0


        .. versionadded:: 0.4.1

        """
        if src == tar:
            return 1.0
        if not src or not tar:
            return 0.0

        if self.params['tokenizer']:
            src = self.params['tokenizer'].tokenize(src).get_list()
            tar = self.params['tokenizer'].tokenize(tar).get_list()

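        # Guth's rule list: the names keep matching as long as, at each
        # position, the source token occurs within a window of the target
        # (one position back through two ahead), the target token occurs in
        # the same window of the source, or the two names agree one or two
        # positions ahead.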
        for pos in range(len(src)):
            s = self._token_at(src, pos)
            if s and s in set(tar[max(0, pos - 1) : pos + 3]):
                continue

            t = self._token_at(tar, pos)
            if t and t in set(src[max(0, pos - 1) : pos + 3]):
                continue

            s = self._token_at(src, pos + 1)
            t = self._token_at(tar, pos + 1)
            if s and t and s == t:
                continue

            s = self._token_at(src, pos + 2)
            t = self._token_at(tar, pos + 2)
            if s and t and s == t:
                continue

            break
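        # The else clause runs only if the loop finished without break,
        # i.e. every position satisfied at least one of the rules above.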
        else:
            return 1.0
        return 0.0

    def sim(self, src: str, tar: str) -> float:
        """Return the relative Guth similarity of two strings.

        This deviates from the algorithm described in :cite:`Guth:1976` in
        that more distant matches are penalized, so that less similar terms
        score lower than more similar terms.

        If no match is found for a particular token in the source string,
        this does not result in an automatic 0.0 score. Rather, the score is
        further penalized towards 0.0.

        Parameters
        ----------
        src : str
            Source string for comparison
        tar : str
            Target string for comparison

        Returns
        -------
        float
            Relative Guth matching score

        Examples
        --------
        >>> cmp = Guth()
        >>> cmp.sim('cat', 'hat')
        0.8666666666666667
        >>> cmp.sim('Niall', 'Neil')
        0.8800000000000001
        >>> cmp.sim('aluminum', 'Catalan')
        0.4
        >>> cmp.sim('ATCG', 'TAGC')
        0.8


        .. versionadded:: 0.4.1

        """
        if src == tar:
            return 1.0
        if not src or not tar:
            return 0.0

        if self.params['tokenizer']:
            src = self.params['tokenizer'].tokenize(src).get_list()
            tar = self.params['tokenizer'].tokenize(tar).get_list()

        score = 0.0
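        # Weighted version of the rule list: a match at the same position
        # scores 1.0; a match offset by one position, 0.8; by two positions,
        # 0.6; agreement one position ahead in both names, 0.6; and two
        # positions ahead in both names, 0.2.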
        for pos in range(len(src)):
            s = self._token_at(src, pos)
            t = self._token_at(tar, pos)
            if s and t and s == t:
                score += 1.0
                continue

            t = self._token_at(tar, pos + 1)
            if s and t and s == t:
                score += 0.8
                continue

            t = self._token_at(tar, pos + 2)
            if s and t and s == t:
                score += 0.6
                continue

            t = self._token_at(tar, pos - 1)
            if s and t and s == t:
                score += 0.8
                continue

            s = self._token_at(src, pos - 1)
            t = self._token_at(tar, pos)
            if s and t and s == t:
                score += 0.8
                continue

            s = self._token_at(src, pos + 1)
            if s and t and s == t:
                score += 0.8
                continue

            s = self._token_at(src, pos + 2)
            if s and t and s == t:
                score += 0.6
                continue

            s = self._token_at(src, pos + 1)
            t = self._token_at(tar, pos + 1)
            if s and t and s == t:
                score += 0.6
                continue

            s = self._token_at(src, pos + 2)
            t = self._token_at(tar, pos + 2)
            if s and t and s == t:
                score += 0.2
                continue

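        # Normalize by the number of source tokens so that the result falls
        # in the range [0, 1].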
        return score / len(src)


if __name__ == '__main__':
    import doctest

    doctest.testmod()
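
A minimal usage sketch (not part of the analyzed file), assuming abydos is
installed and Guth is exported from abydos.distance as the docstrings above
indicate; the expected values are taken from those docstrings:

from abydos.distance import Guth

cmp = Guth()
cmp.sim_score('Niall', 'Neil')  # 1.0: the rule list declares a match
cmp.sim('Niall', 'Neil')        # 0.8800000000000001: distant matches cost

# Matching on bigrams rather than single characters: passing qval=2 with no
# tokenizer makes the instance build a QGrams tokenizer internally (an
# illustration of the qval parameter documented in __init__).
bigram_cmp = Guth(qval=2)
bigram_cmp.sim('Niall', 'Neil')  # relative score over q-gram tokens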