# Copyright 2019-2020 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.

"""abydos.distance._saps_alignment.

Syllable Alignment Pattern Searching similarity
"""

from typing import Any, Callable, List, Optional, Tuple, cast

from numpy import int_ as np_int
from numpy import zeros as np_zeros

from ._distance import _Distance
from ..tokenizer import SAPSTokenizer, _Tokenizer

__all__ = ['SAPS']


class SAPS(_Distance):
    """Syllable Alignment Pattern Searching similarity.

    This is the alignment and similarity calculation described on
    pp. 917-918 of :cite:`Ruibin:2005`.

    .. versionadded:: 0.4.0
    """

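    # The measure proceeds in three steps (see sim_score below):
    #   1. tokenize each string into syllables (SAPSTokenizer by default);
    #   2. re-case each syllable so that its first character is uppercase
    #      and the rest are lowercase;
    #   3. align the re-cased strings with a Needleman-Wunsch-style
    #      dynamic program whose costs distinguish syllable-initial from
    #      syllable-internal characters.
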
    def __init__(
        self,
        cost: Tuple[int, int, int, int, int, int, int] = (
            1,
            -1,
            -4,
            6,
            -2,
            -1,
            -3,
        ),
        normalizer: Callable[[List[float]], float] = max,
        tokenizer: Optional[_Tokenizer] = None,
        **kwargs: Any
    ):
        """Initialize SAPS instance.

        Parameters
        ----------
        cost : tuple
            A 7-tuple representing the costs of the five possible match or
            mismatch types and the two gap types:

            - syllable-internal match
            - syllable-internal mis-match
            - syllable-initial match or mismatch with syllable-internal
            - syllable-initial match
            - syllable-initial mis-match
            - syllable-internal gap
            - syllable-initial gap

            (by default: (1, -1, -4, 6, -2, -1, -3))
        normalizer : function
            A function that takes a list and computes a normalization term
            by which the edit distance is divided (max by default). Another
            good option is the sum function.
        tokenizer : _Tokenizer
            A tokenizer instance (SAPSTokenizer by default)
        **kwargs
            Arbitrary keyword arguments


        .. versionadded:: 0.4.0

        """
        super(SAPS, self).__init__(**kwargs)
        # Match/mismatch costs (s1-s5) and gap costs (g1, g2), in the
        # order documented above
        self._s1, self._s2, self._s3, self._s4, self._s5 = cost[:5]
        self._g1, self._g2 = cost[5:]

        self._normalizer = normalizer
        if tokenizer is not None:
            self._tokenizer = tokenizer
        else:
            self._tokenizer = SAPSTokenizer()

    def _s(self, src: str, tar: str) -> int:
        # Substitution/match score for one aligned pair of characters;
        # after re-casing, uppercase marks syllable-initial characters and
        # lowercase marks syllable-internal ones
        if src.isupper():
            if tar.isupper():
                return self._s4 if src == tar else self._s5
            else:
                return self._s3
        else:
            if tar.islower():
                return self._s1 if src == tar else self._s2
            else:
                return self._s3

    def _g(self, ch: str) -> int:
        # Gap score for one character: gapping a syllable-initial
        # (uppercase) character is penalized more heavily than gapping a
        # syllable-internal (lowercase) one
        if ch.isupper():
            return self._g2
        else:
            return self._g1

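    # For reference, the default cost tuple gives _s and _g the following
    # values (uppercase = syllable-initial, lowercase = syllable-internal):
    #
    #     _s('A', 'A') ->  6    _s('a', 'a') ->  1
    #     _s('A', 'B') -> -2    _s('a', 'b') -> -1
    #     _s('A', 'b') -> -4    _s('a', 'B') -> -4
    #     _g('A')      -> -3    _g('a')      -> -1
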
    def sim_score(self, src: str, tar: str) -> float:
        """Return the SAPS similarity between two strings.

        Parameters
        ----------
        src : str
            Source string for comparison
        tar : str
            Target string for comparison

        Returns
        -------
        int
            The SAPS similarity between src & tar

        Examples
        --------
        >>> cmp = SAPS()
        >>> cmp.sim_score('cat', 'hat')
        0
        >>> cmp.sim_score('Niall', 'Neil')
        3
        >>> cmp.sim_score('aluminum', 'Catalan')
        -11
        >>> cmp.sim_score('ATCG', 'TAGC')
        -1
        >>> cmp.sim_score('Stevenson', 'Stinson')
        16


        .. versionadded:: 0.4.0

        """
        # Tokenize into syllables, then re-case so that each syllable's
        # first character is uppercase and the rest are lowercase
        self._tokenizer.tokenize(src)
        src = ''.join(
            [_[0].upper() + _[1:].lower() for _ in self._tokenizer.get_list()]
        )
        self._tokenizer.tokenize(tar)
        tar = ''.join(
            [_[0].upper() + _[1:].lower() for _ in self._tokenizer.get_list()]
        )

        # Needleman-Wunsch-style global alignment: the first row & column
        # accrue gap costs, and each cell takes the best of insertion,
        # deletion, or substitution/match
        d_mat = np_zeros((len(src) + 1, len(tar) + 1), dtype=np_int)
        for i in range(len(src)):
            d_mat[i + 1, 0] = d_mat[i, 0] + self._g(src[i])
        for j in range(len(tar)):
            d_mat[0, j + 1] = d_mat[0, j] + self._g(tar[j])

        for i in range(len(src)):
            for j in range(len(tar)):
                d_mat[i + 1, j + 1] = max(
                    d_mat[i, j + 1] + self._g(src[i]),  # ins
                    d_mat[i + 1, j] + self._g(tar[j]),  # del
                    d_mat[i, j] + self._s(src[i], tar[j]),  # sub/==
                )

        return cast(float, d_mat[len(src), len(tar)])

    def sim(self, src: str, tar: str) -> float:
        """Return the normalized SAPS similarity between two strings.

        Parameters
        ----------
        src : str
            Source string for comparison
        tar : str
            Target string for comparison

        Returns
        -------
        float
            The normalized SAPS similarity between src & tar

        Examples
        --------
        >>> cmp = SAPS()
        >>> round(cmp.sim('cat', 'hat'), 12)
        0.0
        >>> round(cmp.sim('Niall', 'Neil'), 12)
        0.2
        >>> cmp.sim('aluminum', 'Catalan')
        0.0
        >>> cmp.sim('ATCG', 'TAGC')
        0.0


        .. versionadded:: 0.4.0

        """
        score = self.sim_score(src, tar)
        if score <= 0:
            return 0.0

        # Under the default costs, a syllable's maximum attainable score
        # is 6 for its syllable-initial match plus 1 per remaining
        # character, i.e. 5 + len(syllable)
        self._tokenizer.tokenize(src)
        src_max = sum(5 + len(_) for _ in self._tokenizer.get_list())
        self._tokenizer.tokenize(tar)
        tar_max = sum(5 + len(_) for _ in self._tokenizer.get_list())

        return score / self._normalizer([src_max, tar_max])


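# Example usage, as a sketch (``normalizer=sum`` follows the alternative
# suggested in the ``__init__`` docstring):
#
#     from abydos.distance import SAPS
#
#     cmp = SAPS()  # normalize by the larger maximum score (default)
#     cmp.sim('Stevenson', 'Stinson')
#
#     cmp_sum = SAPS(normalizer=sum)  # normalize by the summed maxima
#     cmp_sum.sim('Stevenson', 'Stinson')
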
if __name__ == '__main__':
    import doctest

    doctest.testmod()