Completed
Pull Request — master (#141)
by Chris
23:14 queued 10:45
created

abydos.distance._lcsseq   A

Complexity

Total Complexity 15

Size/Duplication

Total Lines 247
Duplicated Lines 0 %

Test Coverage

Coverage 100%

Importance

Changes 0
Metric Value
eloc 46
dl 0
loc 247
ccs 37
cts 37
cp 1
rs 10
c 0
b 0
f 0
wmc 15

3 Functions

Rating   Name   Duplication   Size   Complexity  
A lcsseq() 0 30 1
A sim_lcsseq() 0 30 1
A dist_lcsseq() 0 30 1

2 Methods

Rating   Name   Duplication   Size   Complexity  
B LCSseq.lcsseq() 0 60 8
A LCSseq.sim() 0 38 4
1
# -*- coding: utf-8 -*-
2
3
# Copyright 2014-2018 by Christopher C. Little.
4
# This file is part of Abydos.
5
#
6
# Abydos is free software: you can redistribute it and/or modify
7
# it under the terms of the GNU General Public License as published by
8
# the Free Software Foundation, either version 3 of the License, or
9
# (at your option) any later version.
10
#
11
# Abydos is distributed in the hope that it will be useful,
12
# but WITHOUT ANY WARRANTY; without even the implied warranty of
13
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14
# GNU General Public License for more details.
15
#
16
# You should have received a copy of the GNU General Public License
17
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.
18
19 1
"""abydos.distance._lcsseq.
20
21
Longest common subsequence
22
"""
23
24 1
from __future__ import (
25
    absolute_import,
26
    division,
27
    print_function,
28
    unicode_literals,
29
)
30
31 1
from numpy import int as np_int
32 1
from numpy import zeros as np_zeros
33
34 1
from ._distance import _Distance
35
36 1
__all__ = ['LCSseq', 'dist_lcsseq', 'lcsseq', 'sim_lcsseq']
37
38
39 1
class LCSseq(_Distance):
0 ignored issues
show
Unused Code introduced by
The variable __class__ seems to be unused.
Loading history...
40
    """Longest common subsequence.
41
42
    Longest common subsequence (LCSseq) is the longest subsequence of
43
    characters that two strings have in common.
44
    """
45
46 1
    def lcsseq(self, src, tar):
0 ignored issues
show
Coding Style introduced by
This method could be written as a function/class method.

If a method does not access any attributes of the class, it could also be implemented as a function or static method. This can help improve readability. For example

class Foo:
    def some_method(self, x, y):
        return x + y;

could be written as

class Foo:
    @classmethod
    def some_method(cls, x, y):
        return x + y;
Loading history...
47
        """Return the longest common subsequence of two strings.
48
49
        Based on the dynamic programming algorithm from
50
        http://rosettacode.org/wiki/Longest_common_subsequence
51
        :cite:`rosettacode:2018b`. This is licensed GFDL 1.2.
52
53
        Modifications include:
54
            conversion to a numpy array in place of a list of lists
55
56
        Parameters
57
        ----------
58
        src : str
59
            Source string for comparison
60
        tar : str
61
            Target string for comparison
62
63
        Returns
64
        -------
65
        str
66
            The longest common subsequence
67
68
        Examples
69
        --------
70
        >>> sseq = LCSseq()
71
        >>> sseq.lcsseq('cat', 'hat')
72
        'at'
73
        >>> sseq.lcsseq('Niall', 'Neil')
74
        'Nil'
75
        >>> sseq.lcsseq('aluminum', 'Catalan')
76
        'aln'
77
        >>> sseq.lcsseq('ATCG', 'TAGC')
78
        'AC'
79
80
        """
81 1
        lengths = np_zeros((len(src) + 1, len(tar) + 1), dtype=np_int)
82
83
        # row 0 and column 0 are initialized to 0 already
84 1
        for i, src_char in enumerate(src):
85 1
            for j, tar_char in enumerate(tar):
86 1
                if src_char == tar_char:
87 1
                    lengths[i + 1, j + 1] = lengths[i, j] + 1
88
                else:
89 1
                    lengths[i + 1, j + 1] = max(
90
                        lengths[i + 1, j], lengths[i, j + 1]
91
                    )
92
93
        # read the substring out from the matrix
94 1
        result = ''
95 1
        i, j = len(src), len(tar)
96 1
        while i != 0 and j != 0:
97 1
            if lengths[i, j] == lengths[i - 1, j]:
98 1
                i -= 1
99 1
            elif lengths[i, j] == lengths[i, j - 1]:
100 1
                j -= 1
101
            else:
102 1
                result = src[i - 1] + result
103 1
                i -= 1
104 1
                j -= 1
105 1
        return result
106
107 1
    def sim(self, src, tar):
0 ignored issues
show
Bug introduced by
Parameters differ from overridden 'sim' method
Loading history...
108
        r"""Return the longest common subsequence similarity of two strings.
109
110
        Longest common subsequence similarity (:math:`sim_{LCSseq}`).
111
112
        This employs the LCSseq function to derive a similarity metric:
113
        :math:`sim_{LCSseq}(s,t) = \frac{|LCSseq(s,t)|}{max(|s|, |t|)}`
114
115
        Parameters
116
        ----------
117
        src : str
118
            Source string for comparison
119
        tar : str
120
            Target string for comparison
121
122
        Returns
123
        -------
124
        float
125
            LCSseq similarity
126
127
        Examples
128
        --------
129
        >>> sseq = LCSseq()
130
        >>> sseq.sim('cat', 'hat')
131
        0.6666666666666666
132
        >>> sseq.sim('Niall', 'Neil')
133
        0.6
134
        >>> sseq.sim('aluminum', 'Catalan')
135
        0.375
136
        >>> sseq.sim('ATCG', 'TAGC')
137
        0.5
138
139
        """
140 1
        if src == tar:
141 1
            return 1.0
142 1
        elif not src or not tar:
143 1
            return 0.0
144 1
        return len(self.lcsseq(src, tar)) / max(len(src), len(tar))
145
146
147 1
def lcsseq(src, tar):
148
    """Return the longest common subsequence of two strings.
149
150
    This is a wrapper for :py:meth:`LCSseq.lcsseq`.
151
152
    Parameters
153
    ----------
154
    src : str
155
        Source string for comparison
156
    tar : str
157
        Target string for comparison
158
159
    Returns
160
    -------
161
    str
162
        The longest common subsequence
163
164
    Examples
165
    --------
166
    >>> lcsseq('cat', 'hat')
167
    'at'
168
    >>> lcsseq('Niall', 'Neil')
169
    'Nil'
170
    >>> lcsseq('aluminum', 'Catalan')
171
    'aln'
172
    >>> lcsseq('ATCG', 'TAGC')
173
    'AC'
174
175
    """
176 1
    return LCSseq().lcsseq(src, tar)
177
178
179 1
def sim_lcsseq(src, tar):
180
    r"""Return the longest common subsequence similarity of two strings.
181
182
    This is a wrapper for :py:meth:`LCSseq.sim`.
183
184
    Parameters
185
    ----------
186
    src : str
187
        Source string for comparison
188
    tar : str
189
        Target string for comparison
190
191
    Returns
192
    -------
193
    float
194
        LCSseq similarity
195
196
    Examples
197
    --------
198
    >>> sim_lcsseq('cat', 'hat')
199
    0.6666666666666666
200
    >>> sim_lcsseq('Niall', 'Neil')
201
    0.6
202
    >>> sim_lcsseq('aluminum', 'Catalan')
203
    0.375
204
    >>> sim_lcsseq('ATCG', 'TAGC')
205
    0.5
206
207
    """
208 1
    return LCSseq().sim(src, tar)
209
210
211 1
def dist_lcsseq(src, tar):
212
    """Return the longest common subsequence distance between two strings.
213
214
    This is a wrapper for :py:meth:`LCSseq.dist`.
215
216
    Parameters
217
    ----------
218
    src : str
219
        Source string for comparison
220
    tar : str
221
        Target string for comparison
222
223
    Returns
224
    -------
225
    float
226
        LCSseq distance
227
228
    Examples
229
    --------
230
    >>> dist_lcsseq('cat', 'hat')
231
    0.33333333333333337
232
    >>> dist_lcsseq('Niall', 'Neil')
233
    0.4
234
    >>> dist_lcsseq('aluminum', 'Catalan')
235
    0.625
236
    >>> dist_lcsseq('ATCG', 'TAGC')
237
    0.5
238
239
    """
240 1
    return LCSseq().dist(src, tar)
241
242
243
if __name__ == '__main__':
244
    import doctest
245
246
    doctest.testmod()
247