Completed
Pull Request — master (#141)
by Chris
13:03
created

abydos.distance._sift4.sift4_common()   A

Complexity

Conditions 1

Size

Total Lines 28
Code Lines 2

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 2
CRAP Score 1

Importance

Changes 0
Metric Value
eloc 2
dl 0
loc 28
ccs 2
cts 2
cp 1
rs 10
c 0
b 0
f 0
cc 1
nop 4
crap 1
1
# -*- coding: utf-8 -*-
2
3
# Copyright 2018 by Christopher C. Little.
4
# This file is part of Abydos.
5
#
6
# Abydos is free software: you can redistribute it and/or modify
7
# it under the terms of the GNU General Public License as published by
8
# the Free Software Foundation, either version 3 of the License, or
9
# (at your option) any later version.
10
#
11
# Abydos is distributed in the hope that it will be useful,
12
# but WITHOUT ANY WARRANTY; without even the implied warranty of
13
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14
# GNU General Public License for more details.
15
#
16
# You should have received a copy of the GNU General Public License
17
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.
18
19 1
"""abydos.distance.sift4.
20
21
The distance.sift4 module implements Sift4 approximate string distance
22
functions.
23
"""
24
25 1
from __future__ import division, unicode_literals
26
27 1
from six.moves import range
28
29 1
from ._distance import Distance
30
31 1
__all__ = [
32
    'Sift4',
33
    'Sift4Simplest',
34
    'dist_sift4',
35
    'sift4_common',
36
    'sift4_simplest',
37
    'sim_sift4',
38
]
39
40
41 1
class Sift4(Distance):
0 ignored issues
show
Unused Code introduced by
The variable __class__ seems to be unused.
Loading history...
42
    """Sift4 Common version.
43
44
    This is an approximation of edit distance, described in
45
    :cite:`Zackwehdex:2014`.
46
    """
47
48 1
    def dist_abs(self, src, tar, max_offset=5, max_distance=0):
0 ignored issues
show
Comprehensibility introduced by
This function exceeds the maximum number of variables (18/15).
Loading history...
Bug introduced by
Parameters differ from overridden 'dist_abs' method
Loading history...
49
        """Return the "common" Sift4 distance between two terms.
50
51
        Args:
52
            src (str): Source string for comparison
53
            tar (str): Target string for comparison
54
            max_offset (int): the number of characters to search for matching
55
                letters
56
            max_distance (int): the distance at which to stop and exit
57
58
        Returns:
59
            int: The Sift4 distance according to the common formula
60
61
        Examples:
62
            >>> cmp = Sift4()
63
            >>> cmp.dist_abs('cat', 'hat')
64
            1
65
            >>> cmp.dist_abs('Niall', 'Neil')
66
            2
67
            >>> cmp.dist_abs('Colin', 'Cuilen')
68
            3
69
            >>> cmp.dist_abs('ATCG', 'TAGC')
70
            2
71
72
        """
73 1
        if not src:
74 1
            return len(tar)
75
76 1
        if not tar:
77 1
            return len(src)
78
79 1
        src_len = len(src)
80 1
        tar_len = len(tar)
81
82 1
        src_cur = 0
83 1
        tar_cur = 0
84 1
        lcss = 0
85 1
        local_cs = 0
86 1
        trans = 0
87 1
        offset_arr = []
88
89 1
        while (src_cur < src_len) and (tar_cur < tar_len):
90 1
            if src[src_cur] == tar[tar_cur]:
91 1
                local_cs += 1
92 1
                is_trans = False
93 1
                i = 0
94 1
                while i < len(offset_arr):
95 1
                    ofs = offset_arr[i]
96 1
                    if src_cur <= ofs['src_cur'] or tar_cur <= ofs['tar_cur']:
97 1
                        is_trans = abs(tar_cur - src_cur) >= abs(
98
                            ofs['tar_cur'] - ofs['src_cur']
99
                        )
100 1
                        if is_trans:
101 1
                            trans += 1
102 1
                        elif not ofs['trans']:
103 1
                            ofs['trans'] = True
104 1
                            trans += 1
105 1
                        break
106 1
                    elif src_cur > ofs['tar_cur'] and tar_cur > ofs['src_cur']:
107 1
                        del offset_arr[i]
108
                    else:
109 1
                        i += 1
110
111 1
                offset_arr.append(
112
                    {'src_cur': src_cur, 'tar_cur': tar_cur, 'trans': is_trans}
113
                )
114
            else:
115 1
                lcss += local_cs
116 1
                local_cs = 0
117 1
                if src_cur != tar_cur:
118 1
                    src_cur = tar_cur = min(src_cur, tar_cur)
119 1
                for i in range(max_offset):
120 1
                    if not (
121
                        (src_cur + i < src_len) or (tar_cur + i < tar_len)
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
122
                    ):
123 1
                        break
124 1
                    if (src_cur + i < src_len) and (
125
                        src[src_cur + i] == tar[tar_cur]
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
126
                    ):
127 1
                        src_cur += i - 1
128 1
                        tar_cur -= 1
129 1
                        break
130 1
                    if (tar_cur + i < tar_len) and (
131
                        src[src_cur] == tar[tar_cur + i]
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
132
                    ):
133 1
                        src_cur -= 1
134 1
                        tar_cur += i - 1
135 1
                        break
136
137 1
            src_cur += 1
138 1
            tar_cur += 1
139
140 1
            if max_distance:
141 1
                temporary_distance = max(src_cur, tar_cur) - lcss + trans
142 1
                if temporary_distance >= max_distance:
143 1
                    return round(temporary_distance)
144
145 1
            if (src_cur >= src_len) or (tar_cur >= tar_len):
146 1
                lcss += local_cs
147 1
                local_cs = 0
148 1
                src_cur = tar_cur = min(src_cur, tar_cur)
149
150 1
        lcss += local_cs
151 1
        return round(max(src_len, tar_len) - lcss + trans)
152
153 1
    def dist(self, src, tar, max_offset=5, max_distance=0):
0 ignored issues
show
Bug introduced by
Parameters differ from overridden 'dist' method
Loading history...
154
        """Return the normalized "common" Sift4 distance between two terms.
155
156
        This is Sift4 distance, normalized to [0, 1].
157
158
        Args:
159
            src (str): Source string for comparison
160
            tar (str): Target string for comparison
161
            max_offset (int): the number of characters to search for matching
162
                letters
163
            max_distance (int): the distance at which to stop and exit
164
165
        Returns:
166
            float: The normalized Sift4 distance
167
168
        Examples:
169
            >>> cmp = Sift4()
170
            >>> round(cmp.dist('cat', 'hat'), 12)
171
            0.333333333333
172
            >>> cmp.dist('Niall', 'Neil')
173
            0.4
174
            >>> cmp.dist('Colin', 'Cuilen')
175
            0.5
176
            >>> cmp.dist('ATCG', 'TAGC')
177
            0.5
178
179
        """
180 1
        return self.dist_abs(src, tar, max_offset, max_distance) / (
181
            max(len(src), len(tar), 1)
182
        )
183
184
185 1
def sift4_common(src, tar, max_offset=5, max_distance=0):
186
    """Return the "common" Sift4 distance between two terms.
187
188
    This is an approximation of edit distance, described in
189
    :cite:`Zackwehdex:2014`.
190
191
    Args:
192
        src (str): Source string for comparison
193
        tar (str): Target string for comparison
194
        max_offset (int): the number of characters to search for matching
195
            letters
196
        max_distance (int): the distance at which to stop and exit
197
198
    Returns:
199
        int: The Sift4 distance according to the common formula
200
201
    Examples:
202
        >>> sift4_common('cat', 'hat')
203
        1
204
        >>> sift4_common('Niall', 'Neil')
205
        2
206
        >>> sift4_common('Colin', 'Cuilen')
207
        3
208
        >>> sift4_common('ATCG', 'TAGC')
209
        2
210
211
    """
212 1
    return Sift4().dist_abs(src, tar, max_offset, max_distance)
213
214
215 1
def dist_sift4(src, tar, max_offset=5, max_distance=0):
216
    """Return the normalized "common" Sift4 distance between two terms.
217
218
    This is Sift4 distance, normalized to [0, 1].
219
220
    Args:
221
        src (str): Source string for comparison
222
        tar (str): Target string for comparison
223
        max_offset (int): the number of characters to search for matching
224
            letters
225
        max_distance (int): the distance at which to stop and exit
226
227
    Returns:
228
        float: The normalized Sift4 distance
229
230
    Examples:
231
        >>> round(dist_sift4('cat', 'hat'), 12)
232
        0.333333333333
233
        >>> dist_sift4('Niall', 'Neil')
234
        0.4
235
        >>> dist_sift4('Colin', 'Cuilen')
236
        0.5
237
        >>> dist_sift4('ATCG', 'TAGC')
238
        0.5
239
240
    """
241 1
    return Sift4().dist(src, tar, max_offset, max_distance)
242
243
244 1
def sim_sift4(src, tar, max_offset=5, max_distance=0):
245
    """Return the normalized "common" Sift4 similarity of two terms.
246
247
    Normalized Sift4 similarity is the complement of normalized Sift4 distance:
248
    :math:`sim_{Sift4} = 1 - dist_{Sift4}`.
249
250
    Args:
251
        src (str): Source string for comparison
252
        tar (str): Target string for comparison
253
        max_offset (int): the number of characters to search for matching
254
            letters
255
        max_distance (int): the distance at which to stop and exit
256
257
    Returns:
258
        float: The normalized Sift4 similarity
259
260
    Examples:
261
        >>> round(sim_sift4('cat', 'hat'), 12)
262
        0.666666666667
263
        >>> sim_sift4('Niall', 'Neil')
264
        0.6
265
        >>> sim_sift4('Colin', 'Cuilen')
266
        0.5
267
        >>> sim_sift4('ATCG', 'TAGC')
268
        0.5
269
270
    """
271 1
    return Sift4().sim(src, tar, max_offset, max_distance)
272
273
274 1
class Sift4Simplest(Sift4):
0 ignored issues
show
Unused Code introduced by
The variable __class__ seems to be unused.
Loading history...
275
    """Sift4 Simplest version.
276
277
    This is an approximation of edit distance, described in
278
    :cite:`Zackwehdex:2014`.
279
    """
280
281 1
    def dist_abs(self, src, tar, max_offset=5):
0 ignored issues
show
Bug introduced by
Parameters differ from overridden 'dist_abs' method
Loading history...
282
        """Return the "simplest" Sift4 distance between two terms.
283
284
        Args:
285
            src (str): Source string for comparison
286
            tar (str): Target string for comparison
287
            max_offset (int): the number of characters to search for matching
288
                letters
289
290
        Returns:
291
            int: The Sift4 distance according to the simplest formula
292
293
        Examples:
294
            >>> cmp = Sift4Simplest()
295
            >>> cmp.dist_abs('cat', 'hat')
296
            1
297
            >>> cmp.dist_abs('Niall', 'Neil')
298
            2
299
            >>> cmp.dist_abs('Colin', 'Cuilen')
300
            3
301
            >>> cmp.dist_abs('ATCG', 'TAGC')
302
            2
303
304
        """
305 1
        if not src:
306 1
            return len(tar)
307
308 1
        if not tar:
309 1
            return len(src)
310
311 1
        src_len = len(src)
312 1
        tar_len = len(tar)
313
314 1
        src_cur = 0
315 1
        tar_cur = 0
316 1
        lcss = 0
317 1
        local_cs = 0
318
319 1
        while (src_cur < src_len) and (tar_cur < tar_len):
320 1
            if src[src_cur] == tar[tar_cur]:
321 1
                local_cs += 1
322
            else:
323 1
                lcss += local_cs
324 1
                local_cs = 0
325 1
                if src_cur != tar_cur:
326 1
                    src_cur = tar_cur = max(src_cur, tar_cur)
327 1
                for i in range(max_offset):
328 1
                    if not (
329
                        (src_cur + i < src_len) or (tar_cur + i < tar_len)
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
330
                    ):
331 1
                        break
332 1
                    if (src_cur + i < src_len) and (
333
                        src[src_cur + i] == tar[tar_cur]
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
334
                    ):
335 1
                        src_cur += i
336 1
                        local_cs += 1
337 1
                        break
338 1
                    if (tar_cur + i < tar_len) and (
339
                        src[src_cur] == tar[tar_cur + i]
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
340
                    ):
341 1
                        tar_cur += i
342 1
                        local_cs += 1
343 1
                        break
344
345 1
            src_cur += 1
346 1
            tar_cur += 1
347
348 1
        lcss += local_cs
349 1
        return round(max(src_len, tar_len) - lcss)
350
351
352 1
def sift4_simplest(src, tar, max_offset=5):
353
    """Return the "simplest" Sift4 distance between two terms.
354
355
    Args:
356
        src (str): Source string for comparison
357
        tar (str): Target string for comparison
358
        max_offset (int): the number of characters to search for matching
359
            letters
360
361
    Returns:
362
        int: The Sift4 distance according to the simplest formula
363
364
    Examples:
365
        >>> sift4_simplest('cat', 'hat')
366
        1
367
        >>> sift4_simplest('Niall', 'Neil')
368
        2
369
        >>> sift4_simplest('Colin', 'Cuilen')
370
        3
371
        >>> sift4_simplest('ATCG', 'TAGC')
372
        2
373
374
    """
375 1
    return Sift4Simplest().dist_abs(src, tar, max_offset)
376
377
378
if __name__ == '__main__':
379
    import doctest
380
381
    doctest.testmod()
382