abydos.distance._covington.Covington.alignment()   A
last analyzed

Complexity

Conditions 1

Size

Total Lines 32
Code Lines 3

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 3
CRAP Score 1

Importance

Changes 0
Metric Value
eloc 3
dl 0
loc 32
ccs 3
cts 3
cp 1
rs 10
c 0
b 0
f 0
cc 1
nop 3
crap 1
1
# Copyright 2019-2020 by Christopher C. Little.
2
# This file is part of Abydos.
3
#
4
# Abydos is free software: you can redistribute it and/or modify
5
# it under the terms of the GNU General Public License as published by
6
# the Free Software Foundation, either version 3 of the License, or
7
# (at your option) any later version.
8
#
9
# Abydos is distributed in the hope that it will be useful,
10
# but WITHOUT ANY WARRANTY; without even the implied warranty of
11
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12
# GNU General Public License for more details.
13
#
14
# You should have received a copy of the GNU General Public License
15
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.
16
17
"""abydos.distance._covington.
18
19 1
Covington distance
20
"""
21
22
from collections import namedtuple
23
from typing import Any, List, Optional, Tuple, cast
24 1
from unicodedata import normalize as unicode_normalize
25
26
from ._distance import _Distance
27
28
__all__ = ['Covington']
29
30
Alignment = namedtuple('Alignment', ['src', 'tar', 'score'])
31 1
32 1
33
class Covington(_Distance):
34 1
    r"""Covington distance.
35
36 1
    Covington distance :cite:`Covington:1996`
37
38 1
    .. versionadded:: 0.4.0
39
    """
40
41 1
    def __init__(
42
        self,
43
        weights: Tuple[int, int, int, int, int, int, int, int] = (
44
            0,
45
            5,
46
            10,
47
            30,
48
            60,
49 1
            100,
50
            40,
51
            50,
52
        ),
53
        **kwargs: Any
54
    ) -> None:
55
        """Initialize Covington instance.
56
57
        Parameters
58
        ----------
59
        weights : tuple
60
            An 8-tuple of costs for each kind of match or mismatch described in
61
            Covington's paper:
62
63
                - exact consonant or glide match
64
                - exact vowel match
65
                - vowel-vowel length mismatch or i and y or u and w
66
                - vowel-vowel mismatch
67
                - consonant-consonant mismatch
68
                - consonant-vowel mismatch
69
                - skip preceded by a skip
70
                - skip not preceded by a skip
71
72
            The weights used in Covington's first approximation can be used
73
            by supplying the tuple (0.0, 0.0, 0.5, 0.5, 0.5, 1.0, 0.5, 0.5)
74
        **kwargs
75
            Arbitrary keyword arguments
76 1
77 1
78 1
        .. versionadded:: 0.4.0
79 1
80 1
        """
81
        super(Covington, self).__init__(**kwargs)
82 1
        self._weights = weights
83
        self._vowels = set('aeiou')
84
        self._consonants = set('bcdfghjklmnpqrstvxz')
85
        self._glides = set('wy')
86
87
    def dist_abs(self, src: str, tar: str) -> float:
88
        """Return the Covington distance of two strings.
89
90
        Parameters
91
        ----------
92
        src : str
93
            Source string for comparison
94
        tar : str
95
            Target string for comparison
96
97
        Returns
98
        -------
99
        float
100
            Covington distance
101
102
        Examples
103
        --------
104
        >>> cmp = Covington()
105
        >>> cmp.dist_abs('cat', 'hat')
106
        65
107
        >>> cmp.dist_abs('Niall', 'Neil')
108
        115
109
        >>> cmp.dist_abs('aluminum', 'Catalan')
110
        325
111
        >>> cmp.dist_abs('ATCG', 'TAGC')
112
        200
113 1
114
115 1
        .. versionadded:: 0.4.0
116
117
        """
118
        return cast(float, self.alignments(src, tar, 1)[0][-1])
119
120
    def dist(self, src: str, tar: str) -> float:
121
        """Return the normalized Covington distance of two strings.
122
123
        Parameters
124
        ----------
125
        src : str
126
            Source string for comparison
127
        tar : str
128
            Target string for comparison
129
130
        Returns
131
        -------
132
        float
133
            Normalized Covington distance
134
135
        Examples
136
        --------
137
        >>> cmp = Covington()
138
        >>> cmp.dist('cat', 'hat')
139
        0.19117647058823528
140
        >>> cmp.dist('Niall', 'Neil')
141
        0.25555555555555554
142
        >>> cmp.dist('aluminum', 'Catalan')
143
        0.43333333333333335
144
        >>> cmp.dist('ATCG', 'TAGC')
145
        0.45454545454545453
146 1
147 1
148 1
        .. versionadded:: 0.4.0
149 1
150
        """
151 1
        normalizer = self._weights[5] * min(len(src), len(tar))
152
        if len(src) != len(tar):
153 1
            normalizer += self._weights[7]
154
        normalizer += self._weights[6] * abs(abs(len(src) - len(tar)) - 1)
155
156
        return self.dist_abs(src, tar) / normalizer
157
158
    def alignment(self, src: str, tar: str) -> Tuple[float, str, str]:
159
        """Return the top Covington alignment of two strings.
160
161
        This returns only the top alignment in a standard
162
        (score, source alignment, target alignment) tuple format.
163
164
        Parameters
165
        ----------
166
        src : str
167
            Source string for comparison
168
        tar : str
169
            Target string for comparison
170
171
        Returns
172
        -------
173
        tuple(float, str, str)
174
            Covington score & alignment
175
176
        Examples
177
        --------
178
        >>> cmp = Covington()
179
        >>> cmp.alignment('hart', 'kordis')
180
        (240, 'hart--', 'kordis')
181
        >>> cmp.alignment('niy', 'genu')
182
        (170, '--niy', 'genu-')
183 1
184 1
185
        .. versionadded:: 0.4.1
186 1
187
        """
188
        alignment = self.alignments(src, tar, 1)[0]
189
        return alignment.score, alignment.src, alignment.tar
190
191
    def alignments(
192
        self, src: str, tar: str, top_n: Optional[int] = None
193
    ) -> List[Alignment]:
194
        """Return the Covington alignments of two strings.
195
196
        Parameters
197
        ----------
198
        src : str
199
            Source string for comparison
200
        tar : str
201
            Target string for comparison
202
        top_n : int
203
            The number of alignments to return. If None, all alignments will
204
            be returned. If 0, all alignments with the top score will be
205
            returned.
206
207
        Returns
208
        -------
209
        list
210
            Covington alignments
211
212
        Examples
213
        --------
214
        >>> cmp = Covington()
215
        >>> cmp.alignments('hart', 'kordis', top_n=1)[0]
216
        Alignment(src='hart--', tar='kordis', score=240)
217 1
        >>> cmp.alignments('niy', 'genu', top_n=1)[0]
218 1
        Alignment(src='--niy', tar='genu-', score=170)
219 1
220 1
221
        .. versionadded:: 0.4.0
222
223
        """
224
        if not src:
225
            if not tar:
226
                return [Alignment('', '', 0)]
227 1
            return [
228 1
                Alignment(
229
                    '-' * len(tar),
230
                    src,
231
                    self._weights[7] + self._weights[6] * (len(tar) - 1),
232
                )
233
            ]
234
        if not tar:
235
            return [
236 1
                Alignment(
237
                    src,
238 1
                    '-' * len(src),
239 1
                    self._weights[7] + self._weights[6] * (len(src) - 1),
240 1
                )
241 1
            ]
242
243 1
        terminals = []
244 1
245 1
        def _cost(s: str, t: str) -> float:
246 1
            if s[-1:] == '-':
247
                if s[-2:] == '--':
248 1
                    return self._weights[6]
249
                else:
250 1
                    return self._weights[7]
251 1
            elif t[-1:] == '-':
252
                if t[-2:] == '--':
253 1
                    return self._weights[6]
254 1
                else:
255 1
                    return self._weights[7]
256
257 1
            s = unicode_normalize('NFC', s)[-1:]
258
            t = unicode_normalize('NFC', t)[-1:]
259 1
260 1
            if s == t:
261
                if s in self._consonants or s in self._glides:
262 1
                    return self._weights[0]
263 1
                else:
264
                    return self._weights[1]
265 1
266 1
            if ''.join(sorted([s, t])) in {'iy', 'uw'}:
267
                return self._weights[2]
268 1
269 1
            sd = unicode_normalize('NFKD', s)
270 1
            td = unicode_normalize('NFKD', t)
271 1
272
            if sd[0] == td[0] and s in self._vowels:
273 1
                return self._weights[2]
274
275 1
            if sd[0] in self._vowels and td[0] in self._vowels:
276 1
                return self._weights[3]
277
            if sd[0] in self._consonants and td[0] in self._consonants:
278 1
                return self._weights[4]
279 1
280
            return self._weights[5]
281
282
        def _add_alignments(
283
            cost: float, src: str, tar: str, src_align: str, tar_align: str
284
        ) -> None:
285
            cost += _cost(src_align, tar_align)
286 1
287 1
            if src and tar:
288
                _add_alignments(
289
                    cost,
290 1
                    src[1:],
291 1
                    tar[1:],
292
                    src_align + src[0],
293
                    tar_align + tar[0],
294
                )
295 1
            if tar and tar_align[-1] != '-':
296 1
                _add_alignments(
297
                    cost, src, tar[1:], src_align + '-', tar_align + tar[0]
298 1
                )
299
            if src and src_align[-1] != '-':
300 1
                _add_alignments(
301 1
                    cost, src[1:], tar, src_align + src[0], tar_align + '-'
302 1
                )
303
304 1
            if not src and not tar:
305 1
                terminals.append(Alignment(src_align, tar_align, cost))
306
307 1
            return
308
309 1
        _add_alignments(0, src, tar[1:], '-', tar[0])
310 1
        _add_alignments(0, src[1:], tar, src[0], '-')
311 1
        _add_alignments(0, src[1:], tar[1:], src[0], tar[0])
312 1
313
        terminals = sorted(terminals, key=lambda al: al.score)
314
315 1
        if top_n == 0:
316
            top_score = terminals[0].score
317 1
            top_n = 1
318 1
            while (
319
                top_n < len(terminals) and terminals[top_n].score == top_score
320 1
            ):
321
                top_n += 1
322
323
        if top_n is None:
324
            return terminals
325
        else:
326
            return terminals[:top_n]
327
328
329
if __name__ == '__main__':
330
    import doctest
331
332
    doctest.testmod()
333