Passed
Push — master ( 416c2f...9ec382 )
by Chris
01:03 queued 13s
created

abydos.distance._saps.SAPS.sim()   A

Complexity

Conditions 2

Size

Total Lines 41
Code Lines 9

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 8
CRAP Score 2

Importance

Changes 0
Metric Value
eloc 9
dl 0
loc 41
ccs 8
cts 8
cp 1
rs 9.95
c 0
b 0
f 0
cc 2
nop 3
crap 2
1
# Copyright 2019-2020 by Christopher C. Little.
2
# This file is part of Abydos.
3
#
4
# Abydos is free software: you can redistribute it and/or modify
5
# it under the terms of the GNU General Public License as published by
6
# the Free Software Foundation, either version 3 of the License, or
7
# (at your option) any later version.
8
#
9
# Abydos is distributed in the hope that it will be useful,
10
# but WITHOUT ANY WARRANTY; without even the implied warranty of
11
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12
# GNU General Public License for more details.
13
#
14
# You should have received a copy of the GNU General Public License
15
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.
16
17
"""abydos.distance._saps_alignment.
18
19 1
Syllable Alignment Pattern Searching (SAPS) similarity
20
"""
21
22
from typing import Any, Callable, List, Optional, Tuple, cast
23
24 1
from numpy import int_ as np_int
25
from numpy import zeros as np_zeros
26
27
from ._distance import _Distance
28
from ..tokenizer import SAPSTokenizer, _Tokenizer
29
30
__all__ = ['SAPS']
31 1
32 1
33
class SAPS(_Distance):
    """Syllable Alignment Pattern Searching (SAPS) similarity.

    This is the alignment and similarity calculation described on p. 917-918 of
    :cite:`Ruibin:2005`.

    Both strings are split into tokens by the configured tokenizer. Each
    token is rewritten with its first character uppercased and the rest
    lowercased, so that an uppercase character marks a syllable-initial
    position and a lowercase character a syllable-internal one. The
    rewritten strings are then globally aligned with position-dependent
    match, mismatch and gap costs.

    .. versionadded:: 0.4.0

    """

    def __init__(
        self,
        cost: Tuple[int, int, int, int, int, int, int] = (
            1,
            -1,
            -4,
            6,
            -2,
            -1,
            -3,
        ),
        normalizer: Callable[[List[float]], float] = max,
        tokenizer: Optional[_Tokenizer] = None,
        **kwargs: Any
    ):
        """Initialize SAPS instance.

        Parameters
        ----------
        cost : tuple
            A 7-tuple holding, in this order, the scores for:

                - syllable-internal match
                - syllable-internal mis-match
                - syllable-initial match or mismatch with syllable-internal
                - syllable-initial match
                - syllable-initial mis-match
                - syllable-internal gap
                - syllable-initial gap

            (by default: (1, -1, -4, 6, -2, -1, -3))
        normalizer : function
            A function that takes a list and computes the normalization
            term by which the similarity score is divided in :meth:`sim`
            (max by default). It is called with the two-element list
            ``[src_max, tar_max]``. Another good option is the sum function.
        tokenizer : _Tokenizer
            The tokenizer used to split the input strings; a new
            ``SAPSTokenizer`` instance is created when None is given.
        **kwargs
            Arbitrary keyword arguments

        Raises
        ------
        ValueError
            If ``cost`` does not unpack into exactly seven values


        .. versionadded:: 0.4.0

        """
        super().__init__(**kwargs)
        (
            self._s1,
            self._s2,
            self._s3,
            self._s4,
            self._s5,
            self._g1,
            self._g2,
        ) = cost

        self._normalizer = normalizer
        self._tokenizer = (
            tokenizer if tokenizer is not None else SAPSTokenizer()
        )

    def _s(self, src: str, tar: str) -> int:
        """Return the substitution score for aligning src with tar.

        Uppercase characters are treated as syllable-initial, lowercase
        characters as syllable-internal.
        """
        if src.isupper():
            if tar.isupper():
                # initial vs. initial
                return self._s4 if src == tar else self._s5
            # initial vs. non-initial
            return self._s3
        if tar.islower():
            # internal vs. internal
            return self._s1 if src == tar else self._s2
        # Everything else, including src/tar characters that have no case.
        return self._s3

    def _g(self, ch: str) -> int:
        """Return the gap score for leaving the character ch unaligned."""
        return self._g2 if ch.isupper() else self._g1

    def _syllables(self, word: str) -> List[str]:
        """Return the tokens the configured tokenizer produces for word."""
        self._tokenizer.tokenize(word)
        return cast(List[str], self._tokenizer.get_list())

    def _mark_syllables(self, word: str) -> str:
        """Return word re-cased so uppercase marks each token's first char."""
        # tok[:1] rather than tok[0]: an empty token then contributes
        # nothing instead of raising IndexError.
        return ''.join(
            tok[:1].upper() + tok[1:].lower() for tok in self._syllables(word)
        )

    def sim_score(self, src: str, tar: str) -> float:
        """Return the SAPS similarity between two strings.

        Parameters
        ----------
        src : str
            Source string for comparison
        tar : str
            Target string for comparison

        Returns
        -------
        float
            The SAPS similarity between src & tar. The value is the
            integer alignment score taken directly from the NumPy score
            matrix.

        Examples
        --------
        >>> cmp = SAPS()
        >>> cmp.sim_score('cat', 'hat')
        0
        >>> cmp.sim_score('Niall', 'Neil')
        3
        >>> cmp.sim_score('aluminum', 'Catalan')
        -11
        >>> cmp.sim_score('ATCG', 'TAGC')
        -1
        >>> cmp.sim_score('Stevenson', 'Stinson')
        16


        .. versionadded:: 0.4.0

        """
        src = self._mark_syllables(src)
        tar = self._mark_syllables(tar)

        d_mat = np_zeros((len(src) + 1, len(tar) + 1), dtype=np_int)

        # First column/row: a prefix aligned entirely against gaps.
        for i, src_ch in enumerate(src):
            d_mat[i + 1, 0] = d_mat[i, 0] + self._g(src_ch)
        for j, tar_ch in enumerate(tar):
            d_mat[0, j + 1] = d_mat[0, j] + self._g(tar_ch)

        # Each cell keeps the best-scoring of the three possible moves.
        for i, src_ch in enumerate(src):
            src_gap = self._g(src_ch)  # invariant across the inner loop
            for j, tar_ch in enumerate(tar):
                d_mat[i + 1, j + 1] = max(
                    d_mat[i, j + 1] + src_gap,  # gap opposite src_ch
                    d_mat[i + 1, j] + self._g(tar_ch),  # gap opposite tar_ch
                    d_mat[i, j] + self._s(src_ch, tar_ch),  # sub/==
                )

        return cast(float, d_mat[len(src), len(tar)])

    def sim(self, src: str, tar: str) -> float:
        """Return the normalized SAPS similarity between two strings.

        The raw score from :meth:`sim_score` is divided by the value the
        normalizer returns for ``[src_max, tar_max]``, where each maximum
        is the sum of ``5 + len(token)`` over the tokens of the respective
        string (under the default costs this equals the score of a token
        aligned with itself: 6 for the initial plus 1 per further
        character). Scores that are not positive yield 0.0.

        Parameters
        ----------
        src : str
            Source string for comparison
        tar : str
            Target string for comparison

        Returns
        -------
        float
            The normalized SAPS similarity between src & tar

        Examples
        --------
        >>> cmp = SAPS()
        >>> round(cmp.sim('cat', 'hat'), 12)
        0.0
        >>> round(cmp.sim('Niall', 'Neil'), 12)
        0.2
        >>> cmp.sim('aluminum', 'Catalan')
        0.0
        >>> cmp.sim('ATCG', 'TAGC')
        0.0


        .. versionadded:: 0.4.0

        """
        score = self.sim_score(src, tar)
        if score <= 0:
            return 0.0

        src_max = sum(5 + len(tok) for tok in self._syllables(src))
        tar_max = sum(5 + len(tok) for tok in self._syllables(tar))

        # Honor the normalizer supplied to __init__ (max by default) and
        # hand back a plain Python float rather than a NumPy scalar.
        return float(score / self._normalizer([src_max, tar_max]))
211
212
213
if __name__ == '__main__':
    # When executed as a script, run the doctest examples embedded in this
    # module's docstrings.
    from doctest import testmod

    testmod()
217