Completed
Pull Request — master (#141)
by Chris
11:04
created

abydos.distance._sift4   A

Complexity

Total Complexity 29

Size/Duplication

Total Lines 273
Duplicated Lines 0 %

Test Coverage

Coverage 100%

Importance

Changes 0
Metric Value
eloc 86
dl 0
loc 273
ccs 73
cts 73
cp 1
rs 10
c 0
b 0
f 0
wmc 29

2 Methods

Rating   Name   Duplication   Size   Complexity  
F Sift4.dist_abs() 0 104 25
A Sift4.dist() 0 29 1

3 Functions

Rating   Name   Duplication   Size   Complexity  
A dist_sift4() 0 27 1
A sift4_common() 0 27 1
A sim_sift4() 0 27 1
1
# -*- coding: utf-8 -*-
2
3
# Copyright 2018 by Christopher C. Little.
4
# This file is part of Abydos.
5
#
6
# Abydos is free software: you can redistribute it and/or modify
7
# it under the terms of the GNU General Public License as published by
8
# the Free Software Foundation, either version 3 of the License, or
9
# (at your option) any later version.
10
#
11
# Abydos is distributed in the hope that it will be useful,
12
# but WITHOUT ANY WARRANTY; without even the implied warranty of
13
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14
# GNU General Public License for more details.
15
#
16
# You should have received a copy of the GNU General Public License
17
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.
18
19 1
"""abydos.distance._sift4.
20
21
Sift4 Common approximate string distance
22
"""
23
24 1
from __future__ import (
25
    absolute_import,
26
    division,
27
    print_function,
28
    unicode_literals,
29
)
30
31 1
from six.moves import range
32
33 1
from ._distance import _Distance
34
35 1
__all__ = ['Sift4', 'dist_sift4', 'sift4_common', 'sim_sift4']
36
37
38 1
class Sift4(_Distance):
0 ignored issues
show
Unused Code introduced by
The variable __class__ seems to be unused.
Loading history...
39
    """Sift4 Common version.
40
41
    This is an approximation of edit distance, described in
42
    :cite:`Zackwehdex:2014`.
43
    """
44
45 1
    def dist_abs(self, src, tar, max_offset=5, max_distance=0):
0 ignored issues
show
Comprehensibility introduced by
This function exceeds the maximum number of variables (18/15).
Loading history...
Bug introduced by
Parameters differ from overridden 'dist_abs' method
Loading history...
46
        """Return the "common" Sift4 distance between two terms.
47
48
        Args:
49
            src (str): Source string for comparison
50
            tar (str): Target string for comparison
51
            max_offset (int): the number of characters to search for matching
52
                letters
53
            max_distance (int): the distance at which to stop and exit
54
55
        Returns:
56
            int: The Sift4 distance according to the common formula
57
58
        Examples:
59
            >>> cmp = Sift4()
60
            >>> cmp.dist_abs('cat', 'hat')
61
            1
62
            >>> cmp.dist_abs('Niall', 'Neil')
63
            2
64
            >>> cmp.dist_abs('Colin', 'Cuilen')
65
            3
66
            >>> cmp.dist_abs('ATCG', 'TAGC')
67
            2
68
69
        """
70 1
        if not src:
71 1
            return len(tar)
72
73 1
        if not tar:
74 1
            return len(src)
75
76 1
        src_len = len(src)
77 1
        tar_len = len(tar)
78
79 1
        src_cur = 0
80 1
        tar_cur = 0
81 1
        lcss = 0
82 1
        local_cs = 0
83 1
        trans = 0
84 1
        offset_arr = []
85
86 1
        while (src_cur < src_len) and (tar_cur < tar_len):
87 1
            if src[src_cur] == tar[tar_cur]:
88 1
                local_cs += 1
89 1
                is_trans = False
90 1
                i = 0
91 1
                while i < len(offset_arr):
92 1
                    ofs = offset_arr[i]
93 1
                    if src_cur <= ofs['src_cur'] or tar_cur <= ofs['tar_cur']:
94 1
                        is_trans = abs(tar_cur - src_cur) >= abs(
95
                            ofs['tar_cur'] - ofs['src_cur']
96
                        )
97 1
                        if is_trans:
98 1
                            trans += 1
99 1
                        elif not ofs['trans']:
100 1
                            ofs['trans'] = True
101 1
                            trans += 1
102 1
                        break
103 1
                    elif src_cur > ofs['tar_cur'] and tar_cur > ofs['src_cur']:
104 1
                        del offset_arr[i]
105
                    else:
106 1
                        i += 1
107
108 1
                offset_arr.append(
109
                    {'src_cur': src_cur, 'tar_cur': tar_cur, 'trans': is_trans}
110
                )
111
            else:
112 1
                lcss += local_cs
113 1
                local_cs = 0
114 1
                if src_cur != tar_cur:
115 1
                    src_cur = tar_cur = min(src_cur, tar_cur)
116 1
                for i in range(max_offset):
117 1
                    if not (
118
                        (src_cur + i < src_len) or (tar_cur + i < tar_len)
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
119
                    ):
120 1
                        break
121 1
                    if (src_cur + i < src_len) and (
122
                        src[src_cur + i] == tar[tar_cur]
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
123
                    ):
124 1
                        src_cur += i - 1
125 1
                        tar_cur -= 1
126 1
                        break
127 1
                    if (tar_cur + i < tar_len) and (
128
                        src[src_cur] == tar[tar_cur + i]
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
129
                    ):
130 1
                        src_cur -= 1
131 1
                        tar_cur += i - 1
132 1
                        break
133
134 1
            src_cur += 1
135 1
            tar_cur += 1
136
137 1
            if max_distance:
138 1
                temporary_distance = max(src_cur, tar_cur) - lcss + trans
139 1
                if temporary_distance >= max_distance:
140 1
                    return round(temporary_distance)
141
142 1
            if (src_cur >= src_len) or (tar_cur >= tar_len):
143 1
                lcss += local_cs
144 1
                local_cs = 0
145 1
                src_cur = tar_cur = min(src_cur, tar_cur)
146
147 1
        lcss += local_cs
148 1
        return round(max(src_len, tar_len) - lcss + trans)
149
150 1
    def dist(self, src, tar, max_offset=5, max_distance=0):
0 ignored issues
show
Bug introduced by
Parameters differ from overridden 'dist' method
Loading history...
151
        """Return the normalized "common" Sift4 distance between two terms.
152
153
        This is Sift4 distance, normalized to [0, 1].
154
155
        Args:
156
            src (str): Source string for comparison
157
            tar (str): Target string for comparison
158
            max_offset (int): the number of characters to search for matching
159
                letters
160
            max_distance (int): the distance at which to stop and exit
161
162
        Returns:
163
            float: The normalized Sift4 distance
164
165
        Examples:
166
            >>> cmp = Sift4()
167
            >>> round(cmp.dist('cat', 'hat'), 12)
168
            0.333333333333
169
            >>> cmp.dist('Niall', 'Neil')
170
            0.4
171
            >>> cmp.dist('Colin', 'Cuilen')
172
            0.5
173
            >>> cmp.dist('ATCG', 'TAGC')
174
            0.5
175
176
        """
177 1
        return self.dist_abs(src, tar, max_offset, max_distance) / (
178
            max(len(src), len(tar), 1)
179
        )
180
181
182 1
def sift4_common(src, tar, max_offset=5, max_distance=0):
183
    """Return the "common" Sift4 distance between two terms.
184
185
    This is a wrapper for :py:meth:`Sift4.dist_abs`.
186
187
    Args:
188
        src (str): Source string for comparison
189
        tar (str): Target string for comparison
190
        max_offset (int): the number of characters to search for matching
191
            letters
192
        max_distance (int): the distance at which to stop and exit
193
194
    Returns:
195
        int: The Sift4 distance according to the common formula
196
197
    Examples:
198
        >>> sift4_common('cat', 'hat')
199
        1
200
        >>> sift4_common('Niall', 'Neil')
201
        2
202
        >>> sift4_common('Colin', 'Cuilen')
203
        3
204
        >>> sift4_common('ATCG', 'TAGC')
205
        2
206
207
    """
208 1
    return Sift4().dist_abs(src, tar, max_offset, max_distance)
209
210
211 1
def dist_sift4(src, tar, max_offset=5, max_distance=0):
212
    """Return the normalized "common" Sift4 distance between two terms.
213
214
    This is a wrapper for :py:meth:`Sift4.dist`.
215
216
    Args:
217
        src (str): Source string for comparison
218
        tar (str): Target string for comparison
219
        max_offset (int): the number of characters to search for matching
220
            letters
221
        max_distance (int): the distance at which to stop and exit
222
223
    Returns:
224
        float: The normalized Sift4 distance
225
226
    Examples:
227
        >>> round(dist_sift4('cat', 'hat'), 12)
228
        0.333333333333
229
        >>> dist_sift4('Niall', 'Neil')
230
        0.4
231
        >>> dist_sift4('Colin', 'Cuilen')
232
        0.5
233
        >>> dist_sift4('ATCG', 'TAGC')
234
        0.5
235
236
    """
237 1
    return Sift4().dist(src, tar, max_offset, max_distance)
238
239
240 1
def sim_sift4(src, tar, max_offset=5, max_distance=0):
241
    """Return the normalized "common" Sift4 similarity of two terms.
242
243
    This is a wrapper for :py:meth:`Sift4.sim`.
244
245
    Args:
246
        src (str): Source string for comparison
247
        tar (str): Target string for comparison
248
        max_offset (int): the number of characters to search for matching
249
            letters
250
        max_distance (int): the distance at which to stop and exit
251
252
    Returns:
253
        float: The normalized Sift4 similarity
254
255
    Examples:
256
        >>> round(sim_sift4('cat', 'hat'), 12)
257
        0.666666666667
258
        >>> sim_sift4('Niall', 'Neil')
259
        0.6
260
        >>> sim_sift4('Colin', 'Cuilen')
261
        0.5
262
        >>> sim_sift4('ATCG', 'TAGC')
263
        0.5
264
265
    """
266 1
    return Sift4().sim(src, tar, max_offset, max_distance)
267
268
269
if __name__ == '__main__':
270
    import doctest
271
272
    doctest.testmod()
273