Completed
Branch master (78a222)
by Chris
14:36
created

abydos.distance._sequence   A

Complexity

Total Complexity 33

Size/Duplication

Total Lines 360
Duplicated Lines 0 %

Test Coverage

Coverage 100%

Importance

Changes 0
Metric Value
wmc 33
eloc 95
dl 0
loc 360
ccs 78
cts 78
cp 1
rs 9.76
c 0
b 0
f 0

8 Functions

Rating   Name   Duplication   Size   Complexity  
A sim_lcsstr() 0 27 4
A dist_lcsstr() 0 23 1
C sim_ratcliff_obershelp() 0 80 9
A dist_lcsseq() 0 23 1
A sim_lcsseq() 0 27 4
A dist_ratcliff_obershelp() 0 22 1
B lcsseq() 0 52 8
A lcsstr() 0 41 5
1
# -*- coding: utf-8 -*-
2
3
# Copyright 2014-2018 by Christopher C. Little.
4
# This file is part of Abydos.
5
#
6
# Abydos is free software: you can redistribute it and/or modify
7
# it under the terms of the GNU General Public License as published by
8
# the Free Software Foundation, either version 3 of the License, or
9
# (at your option) any later version.
10
#
11
# Abydos is distributed in the hope that it will be useful,
12
# but WITHOUT ANY WARRANTY; without even the implied warranty of
13
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14
# GNU General Public License for more details.
15
#
16
# You should have received a copy of the GNU General Public License
17
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.
18
19 1
"""abydos.distance.sequence.
20
21
The distance.sequence module implements subsequence and substring edit distance
22
functions, along with Ratcliff-Obershelp similarity & distance.
23
"""
24
25 1
from __future__ import division, unicode_literals
26
27 1
from numpy import int as np_int
28 1
from numpy import zeros as np_zeros
29
30 1
from six.moves import range
31
32
33 1
__all__ = [
34
    'dist_lcsseq',
35
    'dist_lcsstr',
36
    'dist_ratcliff_obershelp',
37
    'lcsseq',
38
    'lcsstr',
39
    'sim_lcsseq',
40
    'sim_lcsstr',
41
    'sim_ratcliff_obershelp',
42
]
43
44
45 1
def lcsseq(src, tar):
46
    """Return the longest common subsequence of two strings.
47
48
    Longest common subsequence (LCSseq) is the longest subsequence of
49
    characters that two strings have in common.
50
51
    Based on the dynamic programming algorithm from
52
    http://rosettacode.org/wiki/Longest_common_subsequence#Dynamic_Programming_6
53
    :cite:`rosettacode:2018b`. This is licensed GFDL 1.2.
54
55
    Modifications include:
56
        conversion to a numpy array in place of a list of lists
57
58
    :param str src: source string for comparison
59
    :param str tar: target string for comparison
60
    :returns: the longest common subsequence
61
    :rtype: str
62
63
    >>> lcsseq('cat', 'hat')
64
    'at'
65
    >>> lcsseq('Niall', 'Neil')
66
    'Nil'
67
    >>> lcsseq('aluminum', 'Catalan')
68
    'aln'
69
    >>> lcsseq('ATCG', 'TAGC')
70
    'AC'
71
    """
72 1
    lengths = np_zeros((len(src) + 1, len(tar) + 1), dtype=np_int)
73
74
    # row 0 and column 0 are initialized to 0 already
75 1
    for i, src_char in enumerate(src):
76 1
        for j, tar_char in enumerate(tar):
77 1
            if src_char == tar_char:
78 1
                lengths[i + 1, j + 1] = lengths[i, j] + 1
79
            else:
80 1
                lengths[i + 1, j + 1] = max(
81
                    lengths[i + 1, j], lengths[i, j + 1]
82
                )
83
84
    # read the substring out from the matrix
85 1
    result = ''
86 1
    i, j = len(src), len(tar)
87 1
    while i != 0 and j != 0:
88 1
        if lengths[i, j] == lengths[i - 1, j]:
89 1
            i -= 1
90 1
        elif lengths[i, j] == lengths[i, j - 1]:
91 1
            j -= 1
92
        else:
93 1
            result = src[i - 1] + result
94 1
            i -= 1
95 1
            j -= 1
96 1
    return result
97
98
99 1
def sim_lcsseq(src, tar):
100
    r"""Return the longest common subsequence similarity of two strings.
101
102
    Longest common subsequence similarity (:math:`sim_{LCSseq}`).
103
104
    This employs the LCSseq function to derive a similarity metric:
105
    :math:`sim_{LCSseq}(s,t) = \\frac{|LCSseq(s,t)|}{max(|s|, |t|)}`
106
107
    :param str src: source string for comparison
108
    :param str tar: target string for comparison
109
    :returns: LCSseq similarity
110
    :rtype: float
111
112
    >>> sim_lcsseq('cat', 'hat')
113
    0.6666666666666666
114
    >>> sim_lcsseq('Niall', 'Neil')
115
    0.6
116
    >>> sim_lcsseq('aluminum', 'Catalan')
117
    0.375
118
    >>> sim_lcsseq('ATCG', 'TAGC')
119
    0.5
120
    """
121 1
    if src == tar:
122 1
        return 1.0
123 1
    elif not src or not tar:
124 1
        return 0.0
125 1
    return len(lcsseq(src, tar)) / max(len(src), len(tar))
126
127
128 1
def dist_lcsseq(src, tar):
129
    """Return the longest common subsequence distance between two strings.
130
131
    Longest common subsequence distance (:math:`dist_{LCSseq}`).
132
133
    This employs the LCSseq function to derive a similarity metric:
134
    :math:`dist_{LCSseq}(s,t) = 1 - sim_{LCSseq}(s,t)`
135
136
    :param str src: source string for comparison
137
    :param str tar: target string for comparison
138
    :returns: LCSseq distance
139
    :rtype: float
140
141
    >>> dist_lcsseq('cat', 'hat')
142
    0.33333333333333337
143
    >>> dist_lcsseq('Niall', 'Neil')
144
    0.4
145
    >>> dist_lcsseq('aluminum', 'Catalan')
146
    0.625
147
    >>> dist_lcsseq('ATCG', 'TAGC')
148
    0.5
149
    """
150 1
    return 1 - sim_lcsseq(src, tar)
151
152
153 1
def lcsstr(src, tar):
154
    """Return the longest common substring of two strings.
155
156
    Longest common substring (LCSstr).
157
158
    Based on the code from
159
    https://en.wikibooks.org/wiki/Algorithm_Implementation/Strings/Longest_common_substring#Python
160
    :cite:`Wikibooks:2018`.
161
    This is licensed Creative Commons: Attribution-ShareAlike 3.0.
162
163
    Modifications include:
164
165
        - conversion to a numpy array in place of a list of lists
166
        - conversion to Python 2/3-safe range from xrange via six
167
168
    :param str src: source string for comparison
169
    :param str tar: target string for comparison
170
    :returns: the longest common substring
171
    :rtype: str
172
173
    >>> lcsstr('cat', 'hat')
174
    'at'
175
    >>> lcsstr('Niall', 'Neil')
176
    'N'
177
    >>> lcsstr('aluminum', 'Catalan')
178
    'al'
179
    >>> lcsstr('ATCG', 'TAGC')
180
    'A'
181
    """
182 1
    lengths = np_zeros((len(src) + 1, len(tar) + 1), dtype=np_int)
183 1
    longest, i_longest = 0, 0
184 1
    for i in range(1, len(src) + 1):
185 1
        for j in range(1, len(tar) + 1):
186 1
            if src[i - 1] == tar[j - 1]:
187 1
                lengths[i, j] = lengths[i - 1, j - 1] + 1
188 1
                if lengths[i, j] > longest:
189 1
                    longest = lengths[i, j]
190 1
                    i_longest = i
191
            else:
192 1
                lengths[i, j] = 0
193 1
    return src[i_longest - longest : i_longest]
194
195
196 1
def sim_lcsstr(src, tar):
197
    r"""Return the longest common substring similarity of two strings.
198
199
    Longest common substring similarity (:math:`sim_{LCSstr}`).
200
201
    This employs the LCS function to derive a similarity metric:
202
    :math:`sim_{LCSstr}(s,t) = \\frac{|LCSstr(s,t)|}{max(|s|, |t|)}`
203
204
    :param str src: source string for comparison
205
    :param str tar: target string for comparison
206
    :returns: LCSstr similarity
207
    :rtype: float
208
209
    >>> sim_lcsstr('cat', 'hat')
210
    0.6666666666666666
211
    >>> sim_lcsstr('Niall', 'Neil')
212
    0.2
213
    >>> sim_lcsstr('aluminum', 'Catalan')
214
    0.25
215
    >>> sim_lcsstr('ATCG', 'TAGC')
216
    0.25
217
    """
218 1
    if src == tar:
219 1
        return 1.0
220 1
    elif not src or not tar:
221 1
        return 0.0
222 1
    return len(lcsstr(src, tar)) / max(len(src), len(tar))
223
224
225 1
def dist_lcsstr(src, tar):
226
    """Return the longest common substring distance between two strings.
227
228
    Longest common substring distance (:math:`dist_{LCSstr}`).
229
230
    This employs the LCS function to derive a similarity metric:
231
    :math:`dist_{LCSstr}(s,t) = 1 - sim_{LCSstr}(s,t)`
232
233
    :param str src: source string for comparison
234
    :param str tar: target string for comparison
235
    :returns: LCSstr distance
236
    :rtype: float
237
238
    >>> dist_lcsstr('cat', 'hat')
239
    0.33333333333333337
240
    >>> dist_lcsstr('Niall', 'Neil')
241
    0.8
242
    >>> dist_lcsstr('aluminum', 'Catalan')
243
    0.75
244
    >>> dist_lcsstr('ATCG', 'TAGC')
245
    0.75
246
    """
247 1
    return 1 - sim_lcsstr(src, tar)
248
249
250 1
def sim_ratcliff_obershelp(src, tar):
251
    """Return the Ratcliff-Obershelp similarity of two strings.
252
253
    This follows the Ratcliff-Obershelp algorithm :cite:`Ratcliff:1988` to
254
    derive a similarity measure:
255
256
        1. Find the length of the longest common substring in src & tar.
257
        2. Recurse on the strings to the left & right of each this substring
258
           in src & tar. The base case is a 0 length common substring, in which
259
           case, return 0. Otherwise, return the sum of the current longest
260
           common substring and the left & right recursed sums.
261
        3. Multiply this length by 2 and divide by the sum of the lengths of
262
           src & tar.
263
264
    Cf.
265
    http://www.drdobbs.com/database/pattern-matching-the-gestalt-approach/184407970
266
267
    :param str src: source string for comparison
268
    :param str tar: target string for comparison
269
    :returns: Ratcliff-Obershelp similarity
270
    :rtype: float
271
272
    >>> round(sim_ratcliff_obershelp('cat', 'hat'), 12)
273
    0.666666666667
274
    >>> round(sim_ratcliff_obershelp('Niall', 'Neil'), 12)
275
    0.666666666667
276
    >>> round(sim_ratcliff_obershelp('aluminum', 'Catalan'), 12)
277
    0.4
278
    >>> sim_ratcliff_obershelp('ATCG', 'TAGC')
279
    0.5
280
    """
281
282 1
    def _lcsstr_stl(src, tar):
283
        """Return start positions & length for Ratcliff-Obershelp.
284
285
        Return the start position in the source string, start position in
286
        the target string, and length of the longest common substring of
287
        strings src and tar.
288
        """
289 1
        lengths = np_zeros((len(src) + 1, len(tar) + 1), dtype=np_int)
290 1
        longest, src_longest, tar_longest = 0, 0, 0
291 1
        for i in range(1, len(src) + 1):
292 1
            for j in range(1, len(tar) + 1):
293 1
                if src[i - 1] == tar[j - 1]:
294 1
                    lengths[i, j] = lengths[i - 1, j - 1] + 1
295 1
                    if lengths[i, j] > longest:
296 1
                        longest = lengths[i, j]
297 1
                        src_longest = i
298 1
                        tar_longest = j
299
                else:
300 1
                    lengths[i, j] = 0
301 1
        return src_longest - longest, tar_longest - longest, longest
302
303 1
    def _sstr_matches(src, tar):
304
        """Return the sum of substring match lengths.
305
306
        This follows the Ratcliff-Obershelp algorithm :cite:`Ratcliff:1988`:
307
             1. Find the length of the longest common substring in src & tar.
308
             2. Recurse on the strings to the left & right of each this
309
                 substring in src & tar.
310
             3. Base case is a 0 length common substring, in which case,
311
                 return 0.
312
             4. Return the sum.
313
        """
314 1
        src_start, tar_start, length = _lcsstr_stl(src, tar)
315 1
        if length == 0:
316 1
            return 0
317 1
        return (
318
            _sstr_matches(src[:src_start], tar[:tar_start])
319
            + length
320
            + _sstr_matches(
321
                src[src_start + length :], tar[tar_start + length :]
322
            )
323
        )
324
325 1
    if src == tar:
326 1
        return 1.0
327 1
    elif not src or not tar:
328 1
        return 0.0
329 1
    return 2 * _sstr_matches(src, tar) / (len(src) + len(tar))
330
331
332 1
def dist_ratcliff_obershelp(src, tar):
333
    """Return the Ratcliff-Obershelp distance between two strings.
334
335
    Ratcliff-Obsershelp distance the complement of Ratcliff-Obershelp
336
    similarity:
337
    :math:`dist_{Ratcliff-Obershelp} = 1 - sim_{Ratcliff-Obershelp}`.
338
339
    :param str src: source string for comparison
340
    :param str tar: target string for comparison
341
    :returns: Ratcliff-Obershelp distance
342
    :rtype: float
343
344
    >>> round(dist_ratcliff_obershelp('cat', 'hat'), 12)
345
    0.333333333333
346
    >>> round(dist_ratcliff_obershelp('Niall', 'Neil'), 12)
347
    0.333333333333
348
    >>> round(dist_ratcliff_obershelp('aluminum', 'Catalan'), 12)
349
    0.6
350
    >>> dist_ratcliff_obershelp('ATCG', 'TAGC')
351
    0.5
352
    """
353 1
    return 1 - sim_ratcliff_obershelp(src, tar)
354
355
356
if __name__ == '__main__':
357
    import doctest
358
359
    doctest.testmod()
360