abydos.distance._sift4   A
last analyzed

Complexity

Total Complexity 27

Size/Duplication

Total Lines 215
Duplicated Lines 0 %

Test Coverage

Coverage 100%

Importance

Changes 0
Metric Value
wmc 27
eloc 79
dl 0
loc 215
ccs 72
cts 72
cp 1
rs 10
c 0
b 0
f 0

3 Methods

Rating   Name   Duplication   Size   Complexity  
A Sift4.__init__() 0 21 1
F Sift4.dist_abs() 0 112 25
A Sift4.dist() 0 36 1
1
# Copyright 2018-2020 by Christopher C. Little.
2
# This file is part of Abydos.
3
#
4
# Abydos is free software: you can redistribute it and/or modify
5
# it under the terms of the GNU General Public License as published by
6
# the Free Software Foundation, either version 3 of the License, or
7
# (at your option) any later version.
8
#
9
# Abydos is distributed in the hope that it will be useful,
10
# but WITHOUT ANY WARRANTY; without even the implied warranty of
11
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12
# GNU General Public License for more details.
13
#
14
# You should have received a copy of the GNU General Public License
15
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.
16
17
"""abydos.distance._sift4.
18
19 1
Sift4 Common approximate string distance
20
"""
21
22
from typing import Any, Dict, List, Union
23
24 1
from ._distance import _Distance
25
26
__all__ = ['Sift4']
27
28
29
class Sift4(_Distance):
30
    """Sift4 Common version.
31 1
32
    This is an approximation of edit distance, described in
33 1
    :cite:`Zackwehdex:2014`.
34
35 1
    .. versionadded:: 0.3.6
36 1
    """
37
38 1
    def __init__(
39
        self, max_offset: int = 5, max_distance: int = 0, **kwargs: Any
40
    ) -> None:
41 1
        """Initialize Sift4 instance.
42
43
        Parameters
44
        ----------
45
        max_offset : int
46
            The number of characters to search for matching letters
47
        max_distance : int
48
            The distance at which to stop and exit
49
        **kwargs
50 1
            Arbitrary keyword arguments
51
52
53
        .. versionadded:: 0.4.0
54
55
        """
56
        super(Sift4, self).__init__(**kwargs)
57
        self._max_offset = max_offset
58
        self._max_distance = max_distance
59
60
    def dist_abs(self, src: str, tar: str) -> float:
61
        """Return the "common" Sift4 distance between two terms.
62
63
        Parameters
64
        ----------
65
        src : str
66 1
            Source string for comparison
67 1
        tar : str
68 1
            Target string for comparison
69
70 1
        Returns
71
        -------
72
        int
73
            The Sift4 distance according to the common formula
74
75
        Examples
76
        --------
77
        >>> cmp = Sift4()
78
        >>> cmp.dist_abs('cat', 'hat')
79
        1
80
        >>> cmp.dist_abs('Niall', 'Neil')
81
        2
82
        >>> cmp.dist_abs('Colin', 'Cuilen')
83
        3
84
        >>> cmp.dist_abs('ATCG', 'TAGC')
85
        2
86
87
88
        .. versionadded:: 0.3.0
89
        .. versionchanged:: 0.3.6
90
            Encapsulated in class
91
92
        """
93
        if not src:
94
            return len(tar)
95
96
        if not tar:
97
            return len(src)
98
99
        src_len = len(src)
100
        tar_len = len(tar)
101
102
        src_cur = 0
103 1
        tar_cur = 0
104 1
        lcss = 0
105
        local_cs = 0
106 1
        trans = 0
107 1
        offset_arr = []  # type: List[Dict[str, Union[int, bool]]]
108
109 1
        while (src_cur < src_len) and (tar_cur < tar_len):
110 1
            if src[src_cur] == tar[tar_cur]:
111
                local_cs += 1
112 1
                is_trans = False
113 1
                i = 0
114 1
                while i < len(offset_arr):
115 1
                    ofs = offset_arr[i]
116 1
                    if src_cur <= ofs['src_cur'] or tar_cur <= ofs['tar_cur']:
117 1
                        is_trans = abs(tar_cur - src_cur) >= abs(
118
                            ofs['tar_cur'] - ofs['src_cur']
119 1
                        )
120 1
                        if is_trans:
121 1
                            trans += 1
122 1
                        elif not ofs['trans']:
123 1
                            ofs['trans'] = True
124 1
                            trans += 1
125 1
                        break
126 1
                    elif src_cur > ofs['tar_cur'] and tar_cur > ofs['src_cur']:
127 1
                        del offset_arr[i]
128
                    else:
129
                        i += 1
130 1
131 1
                offset_arr.append(
132 1
                    {'src_cur': src_cur, 'tar_cur': tar_cur, 'trans': is_trans}
133 1
                )
134 1
            else:
135 1
                lcss += local_cs
136 1
                local_cs = 0
137 1
                if src_cur != tar_cur:
138
                    src_cur = tar_cur = min(src_cur, tar_cur)
139 1
                for i in range(self._max_offset):
140
                    if not (
141 1
                        (src_cur + i < src_len) or (tar_cur + i < tar_len)
142
                    ):
143
                        break
144
                    if (src_cur + i < src_len) and (
145 1
                        src[src_cur + i] == tar[tar_cur]
146 1
                    ):
147 1
                        src_cur += i - 1
148 1
                        tar_cur -= 1
149 1
                        break
150 1
                    if (tar_cur + i < tar_len) and (
151
                        src[src_cur] == tar[tar_cur + i]
152
                    ):
153 1
                        src_cur -= 1
154 1
                        tar_cur += i - 1
155
                        break
156
157 1
            src_cur += 1
158 1
            tar_cur += 1
159 1
160 1
            if self._max_distance:
161
                temporary_distance = max(src_cur, tar_cur) - lcss + trans
162
                if temporary_distance >= self._max_distance:
163 1
                    return round(temporary_distance)
164 1
165 1
            if (src_cur >= src_len) or (tar_cur >= tar_len):
166
                lcss += local_cs
167 1
                local_cs = 0
168 1
                src_cur = tar_cur = min(src_cur, tar_cur)
169
170 1
        lcss += local_cs
171 1
        return round(max(src_len, tar_len) - lcss + trans)
172 1
173 1
    def dist(self, src: str, tar: str) -> float:
174
        """Return the normalized "common" Sift4 distance between two terms.
175 1
176 1
        This is Sift4 distance, normalized to [0, 1].
177 1
178 1
        Parameters
179
        ----------
180 1
        src : str
181 1
            Source string for comparison
182
        tar : str
183 1
            Target string for comparison
184
185
        Returns
186
        -------
187
        float
188
            The normalized Sift4 distance
189
190
        Examples
191
        --------
192
        >>> cmp = Sift4()
193
        >>> round(cmp.dist('cat', 'hat'), 12)
194
        0.333333333333
195
        >>> cmp.dist('Niall', 'Neil')
196
        0.4
197
        >>> cmp.dist('Colin', 'Cuilen')
198
        0.5
199
        >>> cmp.dist('ATCG', 'TAGC')
200
        0.5
201
202
203
        .. versionadded:: 0.3.0
204
        .. versionchanged:: 0.3.6
205
            Encapsulated in class
206
207
        """
208
        return self.dist_abs(src, tar) / (max(len(src), len(tar), 1))
209
210
211
if __name__ == '__main__':
212
    import doctest
213
214
    doctest.testmod()
215