abydos.distance._covington.Covington.alignment() - Code Metrics - chrislit/abydos - Measure and Improve Code Quality continuously with Scrutinizer

abydos.distance._covington.Covington.alignment() A
last analyzed 2020-12-31 20:10 UTC

↳ Parent: abydos.distance._covington

Complexity

Conditions

Size

Total Lines	32
Code Lines	3

Duplication

Lines	0
Ratio	0 %

Code Coverage

Tests	3
CRAP Score	1

Importance

Changes

Metric	Value
eloc	3
dl	0
loc	32
ccs	3
cts	3
cp	1
rs	10
c	0
b	0
f	0
cc	1
nop	3
crap	1

# Copyright 2019-2020 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.

"""abydos.distance._covington.

Covington distance
"""

from collections import namedtuple
from typing import Any, List, Optional, Tuple, cast
from unicodedata import normalize as unicode_normalize

from ._distance import _Distance

__all__ = ['Covington']

Alignment = namedtuple('Alignment', ['src', 'tar', 'score'])


class Covington(_Distance):
    r"""Covington distance.

    Covington distance :cite:`Covington:1996`

    .. versionadded:: 0.4.0
    """

    def __init__(
        self,
        weights: Tuple[int, int, int, int, int, int, int, int] = (
            0,
            5,
            10,
            30,
            60,
            100,
            40,
            50,
        ),
        **kwargs: Any
    ) -> None:
        """Initialize Covington instance.

        Parameters
        ----------
        weights : tuple
            An 8-tuple of costs for each kind of match or mismatch described in
            Covington's paper:

                - exact consonant or glide match
                - exact vowel match
                - vowel-vowel length mismatch or i and y or u and w
                - vowel-vowel mismatch
                - consonant-consonant mismatch
                - consonant-vowel mismatch
                - skip preceded by a skip
                - skip not preceded by a skip

            The weights used in Covington's first approximation can be used
            by supplying the tuple (0.0, 0.0, 0.5, 0.5, 0.5, 1.0, 0.5, 0.5)
        **kwargs
            Arbitrary keyword arguments


        .. versionadded:: 0.4.0

        """
        super(Covington, self).__init__(**kwargs)
        self._weights = weights
        self._vowels = set('aeiou')
        self._consonants = set('bcdfghjklmnpqrstvxz')
        self._glides = set('wy')

    def dist_abs(self, src: str, tar: str) -> float:
        """Return the Covington distance of two strings.

        Parameters
        ----------
        src : str
            Source string for comparison
        tar : str
            Target string for comparison

        Returns
        -------
        float
            Covington distance

        Examples
        --------
        >>> cmp = Covington()
        >>> cmp.dist_abs('cat', 'hat')
        65
        >>> cmp.dist_abs('Niall', 'Neil')
        115
        >>> cmp.dist_abs('aluminum', 'Catalan')
        325
        >>> cmp.dist_abs('ATCG', 'TAGC')
        200


        .. versionadded:: 0.4.0

        """
        return cast(float, self.alignments(src, tar, 1)[0][-1])

    def dist(self, src: str, tar: str) -> float:
        """Return the normalized Covington distance of two strings.

        Parameters
        ----------
        src : str
            Source string for comparison
        tar : str
            Target string for comparison

        Returns
        -------
        float
            Normalized Covington distance

        Examples
        --------
        >>> cmp = Covington()
        >>> cmp.dist('cat', 'hat')
        0.19117647058823528
        >>> cmp.dist('Niall', 'Neil')
        0.25555555555555554
        >>> cmp.dist('aluminum', 'Catalan')
        0.43333333333333335
        >>> cmp.dist('ATCG', 'TAGC')
        0.45454545454545453


        .. versionadded:: 0.4.0

        """
        normalizer = self._weights[5] * min(len(src), len(tar))
        if len(src) != len(tar):
            normalizer += self._weights[7]
        normalizer += self._weights[6] * abs(abs(len(src) - len(tar)) - 1)

        return self.dist_abs(src, tar) / normalizer

    def alignment(self, src: str, tar: str) -> Tuple[float, str, str]:
        """Return the top Covington alignment of two strings.

        This returns only the top alignment in a standard
        (score, source alignment, target alignment) tuple format.

        Parameters
        ----------
        src : str
            Source string for comparison
        tar : str
            Target string for comparison

        Returns
        -------
        tuple(float, str, str)
            Covington score & alignment

        Examples
        --------
        >>> cmp = Covington()
        >>> cmp.alignment('hart', 'kordis')
        (240, 'hart--', 'kordis')
        >>> cmp.alignment('niy', 'genu')
        (170, '--niy', 'genu-')


        .. versionadded:: 0.4.1

        """
        alignment = self.alignments(src, tar, 1)[0]
        return alignment.score, alignment.src, alignment.tar

    def alignments(
        self, src: str, tar: str, top_n: Optional[int] = None
    ) -> List[Alignment]:
        """Return the Covington alignments of two strings.

        Parameters
        ----------
        src : str
            Source string for comparison
        tar : str
            Target string for comparison
        top_n : int
            The number of alignments to return. If None, all alignments will
            be returned. If 0, all alignments with the top score will be
            returned.

        Returns
        -------
        list
            Covington alignments

        Examples
        --------
        >>> cmp = Covington()
        >>> cmp.alignments('hart', 'kordis', top_n=1)[0]
        Alignment(src='hart--', tar='kordis', score=240)
        >>> cmp.alignments('niy', 'genu', top_n=1)[0]
        Alignment(src='--niy', tar='genu-', score=170)


        .. versionadded:: 0.4.0

        """
        if not src:
            if not tar:
                return [Alignment('', '', 0)]
            return [
                Alignment(
                    '-' * len(tar),
                    src,
                    self._weights[7] + self._weights[6] * (len(tar) - 1),
                )
            ]
        if not tar:
            return [
                Alignment(
                    src,
                    '-' * len(src),
                    self._weights[7] + self._weights[6] * (len(src) - 1),
                )
            ]

        terminals = []

        def _cost(s: str, t: str) -> float:
            if s[-1:] == '-':
                if s[-2:] == '--':
                    return self._weights[6]
                else:
                    return self._weights[7]
            elif t[-1:] == '-':
                if t[-2:] == '--':
                    return self._weights[6]
                else:
                    return self._weights[7]

            s = unicode_normalize('NFC', s)[-1:]
            t = unicode_normalize('NFC', t)[-1:]

            if s == t:
                if s in self._consonants or s in self._glides:
                    return self._weights[0]
                else:
                    return self._weights[1]

            if ''.join(sorted([s, t])) in {'iy', 'uw'}:
                return self._weights[2]

            sd = unicode_normalize('NFKD', s)
            td = unicode_normalize('NFKD', t)

            if sd[0] == td[0] and s in self._vowels:
                return self._weights[2]

            if sd[0] in self._vowels and td[0] in self._vowels:
                return self._weights[3]
            if sd[0] in self._consonants and td[0] in self._consonants:
                return self._weights[4]

            return self._weights[5]

        def _add_alignments(
            cost: float, src: str, tar: str, src_align: str, tar_align: str
        ) -> None:
            cost += _cost(src_align, tar_align)

            if src and tar:
                _add_alignments(
                    cost,
                    src[1:],
                    tar[1:],
                    src_align + src[0],
                    tar_align + tar[0],
                )
            if tar and tar_align[-1] != '-':
                _add_alignments(
                    cost, src, tar[1:], src_align + '-', tar_align + tar[0]
                )
            if src and src_align[-1] != '-':
                _add_alignments(
                    cost, src[1:], tar, src_align + src[0], tar_align + '-'
                )

            if not src and not tar:
                terminals.append(Alignment(src_align, tar_align, cost))

            return

        _add_alignments(0, src, tar[1:], '-', tar[0])
        _add_alignments(0, src[1:], tar, src[0], '-')
        _add_alignments(0, src[1:], tar[1:], src[0], tar[0])

        terminals = sorted(terminals, key=lambda al: al.score)

        if top_n == 0:
            top_score = terminals[0].score
            top_n = 1
            while (
                top_n < len(terminals) and terminals[top_n].score == top_score
            ):
                top_n += 1

        if top_n is None:
            return terminals
        else:
            return terminals[:top_n]


if __name__ == '__main__':
    import doctest

    doctest.testmod()


1		# Copyright 2019-2020 by Christopher C. Little.
2		# This file is part of Abydos.
3		#
4		# Abydos is free software: you can redistribute it and/or modify
5		# it under the terms of the GNU General Public License as published by
6		# the Free Software Foundation, either version 3 of the License, or
7		# (at your option) any later version.
8		#
9		# Abydos is distributed in the hope that it will be useful,
10		# but WITHOUT ANY WARRANTY; without even the implied warranty of
11		# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12		# GNU General Public License for more details.
13		#
14		# You should have received a copy of the GNU General Public License
15		# along with Abydos. If not, see <http://www.gnu.org/licenses/>.
16
17		"""abydos.distance._covington.
18
19	1	Covington distance
20		"""
21
22		from collections import namedtuple
23		from typing import Any, List, Optional, Tuple, cast
24	1	from unicodedata import normalize as unicode_normalize
25
26		from ._distance import _Distance
27
28		__all__ = ['Covington']
29
30		Alignment = namedtuple('Alignment', ['src', 'tar', 'score'])
31	1
32	1
33		class Covington(_Distance):
34	1	r"""Covington distance.
35
36	1	Covington distance :cite:`Covington:1996`
37
38	1	.. versionadded:: 0.4.0
39		"""
40
41	1	def __init__(
42		self,
43		weights: Tuple[int, int, int, int, int, int, int, int] = (
44		0,
45		5,
46		10,
47		30,
48		60,
49	1	100,
50		40,
51		50,
52		),
53		**kwargs: Any
54		) -> None:
55		"""Initialize Covington instance.
56
57		Parameters
58		----------
59		weights : tuple
60		An 8-tuple of costs for each kind of match or mismatch described in
61		Covington's paper:
62
63		- exact consonant or glide match
64		- exact vowel match
65		- vowel-vowel length mismatch or i and y or u and w
66		- vowel-vowel mismatch
67		- consonant-consonant mismatch
68		- consonant-vowel mismatch
69		- skip preceded by a skip
70		- skip not preceded by a skip
71
72		The weights used in Covington's first approximation can be used
73		by supplying the tuple (0.0, 0.0, 0.5, 0.5, 0.5, 1.0, 0.5, 0.5)
74		**kwargs
75		Arbitrary keyword arguments
76	1
77	1
78	1	.. versionadded:: 0.4.0
79	1
80	1	"""
81		super(Covington, self).__init__(**kwargs)
82	1	self._weights = weights
83		self._vowels = set('aeiou')
84		self._consonants = set('bcdfghjklmnpqrstvxz')
85		self._glides = set('wy')
86
87		def dist_abs(self, src: str, tar: str) -> float:
88		"""Return the Covington distance of two strings.
89
90		Parameters
91		----------
92		src : str
93		Source string for comparison
94		tar : str
95		Target string for comparison
96
97		Returns
98		-------
99		float
100		Covington distance
101
102		Examples
103		--------
104		>>> cmp = Covington()
105		>>> cmp.dist_abs('cat', 'hat')
106		65
107		>>> cmp.dist_abs('Niall', 'Neil')
108		115
109		>>> cmp.dist_abs('aluminum', 'Catalan')
110		325
111		>>> cmp.dist_abs('ATCG', 'TAGC')
112		200
113	1
114
115	1	.. versionadded:: 0.4.0
116
117		"""
118		return cast(float, self.alignments(src, tar, 1)[0][-1])
119
120		def dist(self, src: str, tar: str) -> float:
121		"""Return the normalized Covington distance of two strings.
122
123		Parameters
124		----------
125		src : str
126		Source string for comparison
127		tar : str
128		Target string for comparison
129
130		Returns
131		-------
132		float
133		Normalized Covington distance
134
135		Examples
136		--------
137		>>> cmp = Covington()
138		>>> cmp.dist('cat', 'hat')
139		0.19117647058823528
140		>>> cmp.dist('Niall', 'Neil')
141		0.25555555555555554
142		>>> cmp.dist('aluminum', 'Catalan')
143		0.43333333333333335
144		>>> cmp.dist('ATCG', 'TAGC')
145		0.45454545454545453
146	1
147	1
148	1	.. versionadded:: 0.4.0
149	1
150		"""
151	1	normalizer = self._weights[5] * min(len(src), len(tar))
152		if len(src) != len(tar):
153	1	normalizer += self._weights[7]
154		normalizer += self._weights[6] * abs(abs(len(src) - len(tar)) - 1)
155
156		return self.dist_abs(src, tar) / normalizer
157
158		def alignment(self, src: str, tar: str) -> Tuple[float, str, str]:
159		"""Return the top Covington alignment of two strings.
160
161		This returns only the top alignment in a standard
162		(score, source alignment, target alignment) tuple format.
163
164		Parameters
165		----------
166		src : str
167		Source string for comparison
168		tar : str
169		Target string for comparison
170
171		Returns
172		-------
173		tuple(float, str, str)
174		Covington score & alignment
175
176		Examples
177		--------
178		>>> cmp = Covington()
179		>>> cmp.alignment('hart', 'kordis')
180		(240, 'hart--', 'kordis')
181		>>> cmp.alignment('niy', 'genu')
182		(170, '--niy', 'genu-')
183	1
184	1
185		.. versionadded:: 0.4.1
186	1
187		"""
188		alignment = self.alignments(src, tar, 1)[0]
189		return alignment.score, alignment.src, alignment.tar
190
191		def alignments(
192		self, src: str, tar: str, top_n: Optional[int] = None
193		) -> List[Alignment]:
194		"""Return the Covington alignments of two strings.
195
196		Parameters
197		----------
198		src : str
199		Source string for comparison
200		tar : str
201		Target string for comparison
202		top_n : int
203		The number of alignments to return. If None, all alignments will
204		be returned. If 0, all alignments with the top score will be
205		returned.
206
207		Returns
208		-------
209		list
210		Covington alignments
211
212		Examples
213		--------
214		>>> cmp = Covington()
215		>>> cmp.alignments('hart', 'kordis', top_n=1)[0]
216		Alignment(src='hart--', tar='kordis', score=240)
217	1	>>> cmp.alignments('niy', 'genu', top_n=1)[0]
218	1	Alignment(src='--niy', tar='genu-', score=170)
219	1
220	1
221		.. versionadded:: 0.4.0
222
223		"""
224		if not src:
225		if not tar:
226		return [Alignment('', '', 0)]
227	1	return [
228	1	Alignment(
229		'-' * len(tar),
230		src,
231		self._weights[7] + self._weights[6] * (len(tar) - 1),
232		)
233		]
234		if not tar:
235		return [
236	1	Alignment(
237		src,
238	1	'-' * len(src),
239	1	self._weights[7] + self._weights[6] * (len(src) - 1),
240	1	)
241	1	]
242
243	1	terminals = []
244	1
245	1	def _cost(s: str, t: str) -> float:
246	1	if s[-1:] == '-':
247		if s[-2:] == '--':
248	1	return self._weights[6]
249		else:
250	1	return self._weights[7]
251	1	elif t[-1:] == '-':
252		if t[-2:] == '--':
253	1	return self._weights[6]
254	1	else:
255	1	return self._weights[7]
256
257	1	s = unicode_normalize('NFC', s)[-1:]
258		t = unicode_normalize('NFC', t)[-1:]
259	1
260	1	if s == t:
261		if s in self._consonants or s in self._glides:
262	1	return self._weights[0]
263	1	else:
264		return self._weights[1]
265	1
266	1	if ''.join(sorted([s, t])) in {'iy', 'uw'}:
267		return self._weights[2]
268	1
269	1	sd = unicode_normalize('NFKD', s)
270	1	td = unicode_normalize('NFKD', t)
271	1
272		if sd[0] == td[0] and s in self._vowels:
273	1	return self._weights[2]
274
275	1	if sd[0] in self._vowels and td[0] in self._vowels:
276	1	return self._weights[3]
277		if sd[0] in self._consonants and td[0] in self._consonants:
278	1	return self._weights[4]
279	1
280		return self._weights[5]
281
282		def _add_alignments(
283		cost: float, src: str, tar: str, src_align: str, tar_align: str
284		) -> None:
285		cost += _cost(src_align, tar_align)
286	1
287	1	if src and tar:
288		_add_alignments(
289		cost,
290	1	src[1:],
291	1	tar[1:],
292		src_align + src[0],
293		tar_align + tar[0],
294		)
295	1	if tar and tar_align[-1] != '-':
296	1	_add_alignments(
297		cost, src, tar[1:], src_align + '-', tar_align + tar[0]
298	1	)
299		if src and src_align[-1] != '-':
300	1	_add_alignments(
301	1	cost, src[1:], tar, src_align + src[0], tar_align + '-'
302	1	)
303
304	1	if not src and not tar:
305	1	terminals.append(Alignment(src_align, tar_align, cost))
306
307	1	return
308
309	1	_add_alignments(0, src, tar[1:], '-', tar[0])
310	1	_add_alignments(0, src[1:], tar, src[0], '-')
311	1	_add_alignments(0, src[1:], tar[1:], src[0], tar[0])
312	1
313		terminals = sorted(terminals, key=lambda al: al.score)
314
315	1	if top_n == 0:
316		top_score = terminals[0].score
317	1	top_n = 1
318	1	while (
319		top_n < len(terminals) and terminals[top_n].score == top_score
320	1	):
321		top_n += 1
322
323		if top_n is None:
324		return terminals
325		else:
326		return terminals[:top_n]
327
328
329		if __name__ == '__main__':
330		import doctest
331
332		doctest.testmod()
333

chrislit / abydos

abydos.distance._covington.Covington.alignment() A last analyzed 2020-12-31 20:10 UTC

Complexity

Size

Duplication

Code Coverage

Importance

Duplication Side-by-Side

Filter issues like

abydos.distance._covington.Covington.alignment() A
last analyzed 2020-12-31 20:10 UTC