Completed
Push — master ( f43547...71985b )
by Chris
12:00 queued 10s
created

abydos.distance._sift4.Sift4.dist()   A

Complexity

Conditions 1

Size

Total Lines 36
Code Lines 3

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 2
CRAP Score 1

Importance

Changes 0
Metric Value
cc 1
eloc 3
nop 5
dl 0
loc 36
ccs 2
cts 2
cp 1
crap 1
rs 10
c 0
b 0
f 0
1
# -*- coding: utf-8 -*-
2
3
# Copyright 2018 by Christopher C. Little.
4
# This file is part of Abydos.
5
#
6
# Abydos is free software: you can redistribute it and/or modify
7
# it under the terms of the GNU General Public License as published by
8
# the Free Software Foundation, either version 3 of the License, or
9
# (at your option) any later version.
10
#
11
# Abydos is distributed in the hope that it will be useful,
12
# but WITHOUT ANY WARRANTY; without even the implied warranty of
13
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14
# GNU General Public License for more details.
15
#
16
# You should have received a copy of the GNU General Public License
17
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.
18
19 1
"""abydos.distance._sift4.
20
21
Sift4 Common approximate string distance
22
"""
23
24 1
from __future__ import (
25
    absolute_import,
26
    division,
27
    print_function,
28
    unicode_literals,
29
)
30
31 1
from six.moves import range
32
33 1
from ._distance import _Distance
34
35 1
__all__ = ['Sift4', 'dist_sift4', 'sift4_common', 'sim_sift4']
36
37
38 1
class Sift4(_Distance):
0 ignored issues
show
Unused Code introduced by
The variable __class__ seems to be unused.
Loading history...
39
    """Sift4 Common version.
40
41
    This is an approximation of edit distance, described in
42
    :cite:`Zackwehdex:2014`.
43
    """
44
45 1
    def dist_abs(self, src, tar, max_offset=5, max_distance=0):
0 ignored issues
show
Comprehensibility introduced by
This function exceeds the maximum number of variables (18/15).
Loading history...
Bug introduced by
Parameters differ from overridden 'dist_abs' method
Loading history...
46
        """Return the "common" Sift4 distance between two terms.
47
48
        Parameters
49
        ----------
50
        src : str
51
            Source string for comparison
52
        tar : str
53
            Target string for comparison
54
        max_offset : int
55
            The number of characters to search for matching letters
56
        max_distance : int
57
            The distance at which to stop and exit
58
59
        Returns
60
        -------
61
        int
62
            The Sift4 distance according to the common formula
63
64
        Examples
65
        --------
66
        >>> cmp = Sift4()
67
        >>> cmp.dist_abs('cat', 'hat')
68
        1
69
        >>> cmp.dist_abs('Niall', 'Neil')
70
        2
71
        >>> cmp.dist_abs('Colin', 'Cuilen')
72
        3
73
        >>> cmp.dist_abs('ATCG', 'TAGC')
74
        2
75
76
        """
77 1
        if not src:
78 1
            return len(tar)
79
80 1
        if not tar:
81 1
            return len(src)
82
83 1
        src_len = len(src)
84 1
        tar_len = len(tar)
85
86 1
        src_cur = 0
87 1
        tar_cur = 0
88 1
        lcss = 0
89 1
        local_cs = 0
90 1
        trans = 0
91 1
        offset_arr = []
92
93 1
        while (src_cur < src_len) and (tar_cur < tar_len):
94 1
            if src[src_cur] == tar[tar_cur]:
95 1
                local_cs += 1
96 1
                is_trans = False
97 1
                i = 0
98 1
                while i < len(offset_arr):
99 1
                    ofs = offset_arr[i]
100 1
                    if src_cur <= ofs['src_cur'] or tar_cur <= ofs['tar_cur']:
101 1
                        is_trans = abs(tar_cur - src_cur) >= abs(
102
                            ofs['tar_cur'] - ofs['src_cur']
103
                        )
104 1
                        if is_trans:
105 1
                            trans += 1
106 1
                        elif not ofs['trans']:
107 1
                            ofs['trans'] = True
108 1
                            trans += 1
109 1
                        break
110 1
                    elif src_cur > ofs['tar_cur'] and tar_cur > ofs['src_cur']:
111 1
                        del offset_arr[i]
112
                    else:
113 1
                        i += 1
114
115 1
                offset_arr.append(
116
                    {'src_cur': src_cur, 'tar_cur': tar_cur, 'trans': is_trans}
117
                )
118
            else:
119 1
                lcss += local_cs
120 1
                local_cs = 0
121 1
                if src_cur != tar_cur:
122 1
                    src_cur = tar_cur = min(src_cur, tar_cur)
123 1
                for i in range(max_offset):
124 1
                    if not (
125
                        (src_cur + i < src_len) or (tar_cur + i < tar_len)
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
126
                    ):
127 1
                        break
128 1
                    if (src_cur + i < src_len) and (
129
                        src[src_cur + i] == tar[tar_cur]
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
130
                    ):
131 1
                        src_cur += i - 1
132 1
                        tar_cur -= 1
133 1
                        break
134 1
                    if (tar_cur + i < tar_len) and (
135
                        src[src_cur] == tar[tar_cur + i]
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
136
                    ):
137 1
                        src_cur -= 1
138 1
                        tar_cur += i - 1
139 1
                        break
140
141 1
            src_cur += 1
142 1
            tar_cur += 1
143
144 1
            if max_distance:
145 1
                temporary_distance = max(src_cur, tar_cur) - lcss + trans
146 1
                if temporary_distance >= max_distance:
147 1
                    return round(temporary_distance)
148
149 1
            if (src_cur >= src_len) or (tar_cur >= tar_len):
150 1
                lcss += local_cs
151 1
                local_cs = 0
152 1
                src_cur = tar_cur = min(src_cur, tar_cur)
153
154 1
        lcss += local_cs
155 1
        return round(max(src_len, tar_len) - lcss + trans)
156
157 1
    def dist(self, src, tar, max_offset=5, max_distance=0):
0 ignored issues
show
Bug introduced by
Parameters differ from overridden 'dist' method
Loading history...
158
        """Return the normalized "common" Sift4 distance between two terms.
159
160
        This is Sift4 distance, normalized to [0, 1].
161
162
        Parameters
163
        ----------
164
        src : str
165
            Source string for comparison
166
        tar : str
167
            Target string for comparison
168
        max_offset : int
169
            The number of characters to search for matching letters
170
        max_distance : int
171
            The distance at which to stop and exit
172
173
        Returns
174
        -------
175
        float
176
            The normalized Sift4 distance
177
178
        Examples
179
        --------
180
        >>> cmp = Sift4()
181
        >>> round(cmp.dist('cat', 'hat'), 12)
182
        0.333333333333
183
        >>> cmp.dist('Niall', 'Neil')
184
        0.4
185
        >>> cmp.dist('Colin', 'Cuilen')
186
        0.5
187
        >>> cmp.dist('ATCG', 'TAGC')
188
        0.5
189
190
        """
191 1
        return self.dist_abs(src, tar, max_offset, max_distance) / (
192
            max(len(src), len(tar), 1)
193
        )
194
195
196 1
def sift4_common(src, tar, max_offset=5, max_distance=0):
197
    """Return the "common" Sift4 distance between two terms.
198
199
    This is a wrapper for :py:meth:`Sift4.dist_abs`.
200
201
    Parameters
202
    ----------
203
    src : str
204
        Source string for comparison
205
    tar : str
206
        Target string for comparison
207
    max_offset : int
208
        The number of characters to search for matching letters
209
    max_distance : int
210
        The distance at which to stop and exit
211
212
    Returns
213
    -------
214
    int
215
        The Sift4 distance according to the common formula
216
217
    Examples
218
    --------
219
    >>> sift4_common('cat', 'hat')
220
    1
221
    >>> sift4_common('Niall', 'Neil')
222
    2
223
    >>> sift4_common('Colin', 'Cuilen')
224
    3
225
    >>> sift4_common('ATCG', 'TAGC')
226
    2
227
228
    """
229 1
    return Sift4().dist_abs(src, tar, max_offset, max_distance)
230
231
232 1
def dist_sift4(src, tar, max_offset=5, max_distance=0):
233
    """Return the normalized "common" Sift4 distance between two terms.
234
235
    This is a wrapper for :py:meth:`Sift4.dist`.
236
237
    Parameters
238
    ----------
239
    src : str
240
        Source string for comparison
241
    tar : str
242
        Target string for comparison
243
    max_offset : int
244
        The number of characters to search for matching letters
245
    max_distance : int
246
        The distance at which to stop and exit
247
248
    Returns
249
    -------
250
    float
251
        The normalized Sift4 distance
252
253
    Examples
254
    --------
255
    >>> round(dist_sift4('cat', 'hat'), 12)
256
    0.333333333333
257
    >>> dist_sift4('Niall', 'Neil')
258
    0.4
259
    >>> dist_sift4('Colin', 'Cuilen')
260
    0.5
261
    >>> dist_sift4('ATCG', 'TAGC')
262
    0.5
263
264
    """
265 1
    return Sift4().dist(src, tar, max_offset, max_distance)
266
267
268 1
def sim_sift4(src, tar, max_offset=5, max_distance=0):
269
    """Return the normalized "common" Sift4 similarity of two terms.
270
271
    This is a wrapper for :py:meth:`Sift4.sim`.
272
273
    Parameters
274
    ----------
275
    src : str
276
        Source string for comparison
277
    tar : str
278
        Target string for comparison
279
    max_offset : int
280
        The number of characters to search for matching letters
281
    max_distance : int
282
        The distance at which to stop and exit
283
284
    Returns
285
    -------
286
    float
287
        The normalized Sift4 similarity
288
289
    Examples
290
    --------
291
    >>> round(sim_sift4('cat', 'hat'), 12)
292
    0.666666666667
293
    >>> sim_sift4('Niall', 'Neil')
294
    0.6
295
    >>> sim_sift4('Colin', 'Cuilen')
296
    0.5
297
    >>> sim_sift4('ATCG', 'TAGC')
298
    0.5
299
300
    """
301 1
    return Sift4().sim(src, tar, max_offset, max_distance)
302
303
304
if __name__ == '__main__':
305
    import doctest
306
307
    doctest.testmod()
308