Completed
Push — master ( f43547...71985b )
by Chris
12:00 queued 10s
created

RatcliffObershelp.sim()   C

Complexity

Conditions 9

Size

Total Lines 103
Code Lines 28

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 24
CRAP Score 9

Importance

Changes 0
Metric Value
cc 9
eloc 28
nop 3
dl 0
loc 103
ccs 24
cts 24
cp 1
crap 9
rs 6.6666
c 0
b 0
f 0

How to fix   Long Method   

Long Method

Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.

For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.

Commonly applied refactorings include:

1
# -*- coding: utf-8 -*-
2
3
# Copyright 2014-2018 by Christopher C. Little.
4
# This file is part of Abydos.
5
#
6
# Abydos is free software: you can redistribute it and/or modify
7
# it under the terms of the GNU General Public License as published by
8
# the Free Software Foundation, either version 3 of the License, or
9
# (at your option) any later version.
10
#
11
# Abydos is distributed in the hope that it will be useful,
12
# but WITHOUT ANY WARRANTY; without even the implied warranty of
13
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14
# GNU General Public License for more details.
15
#
16
# You should have received a copy of the GNU General Public License
17
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.
18
19 1
"""abydos.distance._ratcliff_obershelp.
20
21
Ratcliff-Obershelp similarity
22
"""
23
24 1
from __future__ import (
25
    absolute_import,
26
    division,
27
    print_function,
28
    unicode_literals,
29
)
30
31 1
from numpy import int as np_int
32 1
from numpy import zeros as np_zeros
33
34 1
from six.moves import range
35
36 1
from ._distance import _Distance
37
38 1
__all__ = [
39
    'RatcliffObershelp',
40
    'dist_ratcliff_obershelp',
41
    'sim_ratcliff_obershelp',
42
]
43
44
45 1
class RatcliffObershelp(_Distance):
0 ignored issues
show
Unused Code introduced by
The variable __class__ seems to be unused.
Loading history...
46
    """Ratcliff-Obershelp similarity.
47
48
    This follows the Ratcliff-Obershelp algorithm :cite:`Ratcliff:1988` to
49
    derive a similarity measure:
50
51
        1. Find the length of the longest common substring in src & tar.
52
        2. Recurse on the strings to the left & right of each this substring
53
           in src & tar. The base case is a 0 length common substring, in which
54
           case, return 0. Otherwise, return the sum of the current longest
55
           common substring and the left & right recursed sums.
56
        3. Multiply this length by 2 and divide by the sum of the lengths of
57
           src & tar.
58
59
    Cf.
60
    http://www.drdobbs.com/database/pattern-matching-the-gestalt-approach/184407970
61
    """
62
63 1
    def sim(self, src, tar):
0 ignored issues
show
Bug introduced by
Parameters differ from overridden 'sim' method
Loading history...
64
        """Return the Ratcliff-Obershelp similarity of two strings.
65
66
        Parameters
67
        ----------
68
        src : str
69
            Source string for comparison
70
        tar : str
71
            Target string for comparison
72
73
        Returns
74
        -------
75
        float
76
            Ratcliff-Obershelp similarity
77
78
        Examples
79
        --------
80
        >>> cmp = RatcliffObershelp()
81
        >>> round(cmp.sim('cat', 'hat'), 12)
82
        0.666666666667
83
        >>> round(cmp.sim('Niall', 'Neil'), 12)
84
        0.666666666667
85
        >>> round(cmp.sim('aluminum', 'Catalan'), 12)
86
        0.4
87
        >>> cmp.sim('ATCG', 'TAGC')
88
        0.5
89
90
        """
91
92 1
        def _lcsstr_stl(src, tar):
93
            """Return start positions & length for Ratcliff-Obershelp.
94
95
            Parameters
96
            ----------
97
            src : str
98
                Source string for comparison
99
            tar : str
100
            Target string for comparison
101
102
            Returns
103
            -------
104
            tuple
105
                The start position in the source string, start position in the
106
                target string, and length of the longest common substring of
107
                strings src and tar.
108
109
            """
110 1
            lengths = np_zeros((len(src) + 1, len(tar) + 1), dtype=np_int)
111 1
            longest, src_longest, tar_longest = 0, 0, 0
112 1
            for i in range(1, len(src) + 1):
113 1
                for j in range(1, len(tar) + 1):
114 1
                    if src[i - 1] == tar[j - 1]:
115 1
                        lengths[i, j] = lengths[i - 1, j - 1] + 1
116 1
                        if lengths[i, j] > longest:
117 1
                            longest = lengths[i, j]
118 1
                            src_longest = i
119 1
                            tar_longest = j
120
                    else:
121 1
                        lengths[i, j] = 0
122 1
            return src_longest - longest, tar_longest - longest, longest
123
124 1
        def _sstr_matches(src, tar):
125
            """Return the sum of substring match lengths.
126
127
            This follows the Ratcliff-Obershelp algorithm
128
            :cite:`Ratcliff:1988`:
129
                 1. Find the length of the longest common substring in src &
130
                     tar.
131
                 2. Recurse on the strings to the left & right of each this
132
                     substring in src & tar.
133
                 3. Base case is a 0 length common substring, in which case,
134
                     return 0.
135
                 4. Return the sum.
136
137
            Parameters
138
            ----------
139
            src : str
140
                Source string for comparison
141
            tar : str
142
                Target string for comparison
143
144
            Returns
145
            -------
146
            int
147
                Sum of substring match lengths
148
149
            """
150 1
            src_start, tar_start, length = _lcsstr_stl(src, tar)
151 1
            if length == 0:
152 1
                return 0
153 1
            return (
154
                _sstr_matches(src[:src_start], tar[:tar_start])
155
                + length
156
                + _sstr_matches(
157
                    src[src_start + length :], tar[tar_start + length :]
158
                )
159
            )
160
161 1
        if src == tar:
162 1
            return 1.0
163 1
        elif not src or not tar:
164 1
            return 0.0
165 1
        return 2 * _sstr_matches(src, tar) / (len(src) + len(tar))
166
167
168 1
def sim_ratcliff_obershelp(src, tar):
169
    """Return the Ratcliff-Obershelp similarity of two strings.
170
171
    This is a wrapper for :py:meth:`RatcliffObershelp.sim`.
172
173
    Parameters
174
    ----------
175
    src : str
176
        Source string for comparison
177
    tar : str
178
        Target string for comparison
179
180
    Returns
181
    -------
182
    float
183
        Ratcliff-Obershelp similarity
184
185
    Examples
186
    --------
187
    >>> round(sim_ratcliff_obershelp('cat', 'hat'), 12)
188
    0.666666666667
189
    >>> round(sim_ratcliff_obershelp('Niall', 'Neil'), 12)
190
    0.666666666667
191
    >>> round(sim_ratcliff_obershelp('aluminum', 'Catalan'), 12)
192
    0.4
193
    >>> sim_ratcliff_obershelp('ATCG', 'TAGC')
194
    0.5
195
196
    """
197 1
    return RatcliffObershelp().sim(src, tar)
198
199
200 1
def dist_ratcliff_obershelp(src, tar):
201
    """Return the Ratcliff-Obershelp distance between two strings.
202
203
    This is a wrapper for :py:meth:`RatcliffObershelp.dist`.
204
205
    Parameters
206
    ----------
207
    src : str
208
        Source string for comparison
209
    tar : str
210
        Target string for comparison
211
212
    Returns
213
    -------
214
    float
215
        Ratcliff-Obershelp distance
216
217
    Examples
218
    --------
219
    >>> round(dist_ratcliff_obershelp('cat', 'hat'), 12)
220
    0.333333333333
221
    >>> round(dist_ratcliff_obershelp('Niall', 'Neil'), 12)
222
    0.333333333333
223
    >>> round(dist_ratcliff_obershelp('aluminum', 'Catalan'), 12)
224
    0.6
225
    >>> dist_ratcliff_obershelp('ATCG', 'TAGC')
226
    0.5
227
228
    """
229 1
    return RatcliffObershelp().dist(src, tar)
230
231
232
if __name__ == '__main__':
233
    import doctest
234
235
    doctest.testmod()
236