Passed
Push — master (d2a11f...643512)
by Chris
01:59 queued 12s

abydos.distance._relaxed_hamming   A

Complexity

Total Complexity 20

Size/Duplication

Total Lines 209
Duplicated Lines 0%

Test Coverage

Coverage 100%

Importance

Changes 0
Metric Value
wmc 20
eloc 67
dl 0
loc 209
ccs 55
cts 55
cp 1
rs 10
c 0
b 0
f 0

3 Methods

Rating   Name   Duplication   Size   Complexity  
F RelaxedHamming.dist_abs() 0 80 15
A RelaxedHamming.dist() 0 40 3
A RelaxedHamming.__init__() 0 35 2
# -*- coding: utf-8 -*-

# Copyright 2019 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.

"""abydos.distance._relaxed_hamming.

Relaxed Hamming distance
"""

from __future__ import (
    absolute_import,
    division,
    print_function,
    unicode_literals,
)

from ._distance import _Distance
from ..tokenizer import QGrams

__all__ = ['RelaxedHamming']


class RelaxedHamming(_Distance):
    """Relaxed Hamming distance.

    This is a variant of Hamming distance in which positionally close matches
    are considered partially matching.

    .. versionadded:: 0.4.1
    """

    def __init__(self, tokenizer=None, maxdist=2, discount=0.2, **kwargs):
        """Initialize RelaxedHamming instance.

        Parameters
        ----------
        tokenizer : _Tokenizer
            A tokenizer instance from the :py:mod:`abydos.tokenizer` package
        maxdist : int
            The maximum distance to consider for discounting.
        discount : float
            The discount factor multiplied by the distance from the source
            string position.
        **kwargs
            Arbitrary keyword arguments

        Other Parameters
        ----------------
        qval : int
            The length of each q-gram. Using this parameter and tokenizer=None
            will cause the instance to use the QGrams tokenizer with this
            q value.


        .. versionadded:: 0.4.1

        """
        super(RelaxedHamming, self).__init__(**kwargs)

        self.params['tokenizer'] = tokenizer
        if 'qval' in self.params:
            self.params['tokenizer'] = QGrams(
                qval=self.params['qval'], start_stop='$#', skip=0, scaler=None
            )
        self._maxdist = maxdist
        self._discount = discount

    def dist_abs(self, src, tar):
        """Return the relaxed Hamming distance between two strings.

        Parameters
        ----------
        src : str
            Source string for comparison
        tar : str
            Target string for comparison

        Returns
        -------
        float
            Relaxed Hamming distance

        Examples
        --------
        >>> cmp = RelaxedHamming()
        >>> cmp.dist_abs('cat', 'hat')
        1.0
        >>> cmp.dist_abs('Niall', 'Neil')
        1.4
        >>> cmp.dist_abs('aluminum', 'Catalan')
        6.4
        >>> cmp.dist_abs('ATCG', 'TAGC')
        0.8


        .. versionadded:: 0.4.1

        """
        if src == tar:
            return 0

        # Pad the shorter string with a character that occurs in neither
        # string, so both strings have equal length.
        if len(src) != len(tar):
            replacement_char = 1
            while chr(replacement_char) in src or chr(replacement_char) in tar:
                replacement_char += 1
            replacement_char = chr(replacement_char)
            if len(src) < len(tar):
                src += replacement_char * (len(tar) - len(src))
            else:
                tar += replacement_char * (len(src) - len(tar))

        if self.params['tokenizer']:
            src = self.params['tokenizer'].tokenize(src).get_list()
            tar = self.params['tokenizer'].tokenize(tar).get_list()

        score = 0
        for pos in range(len(src)):
            if src[pos] == tar[pos : pos + 1][0]:
                continue

            # Look for the source token up to maxdist positions ahead of pos.
            try:
                diff = (
                    tar[pos + 1 : pos + self._maxdist + 1].index(src[pos]) + 1
                )
            except ValueError:
                diff = 0
            # Look for the source token up to maxdist positions behind pos.
            try:
                found = (
                    tar[max(0, pos - self._maxdist) : pos][::-1].index(
                        src[pos]
                    )
                    + 1
                )
            except ValueError:
                found = 0

            if found and diff:
                diff = min(diff, found)
            elif found:
                diff = found

            # A nearby match adds a discounted penalty; no match adds 1.
            if diff:
                score += min(1.0, self._discount * diff)
            else:
                score += 1.0

        return score

    def dist(self, src, tar):
        """Return the normalized relaxed Hamming distance between strings.

        Parameters
        ----------
        src : str
            Source string for comparison
        tar : str
            Target string for comparison

        Returns
        -------
        float
            Normalized relaxed Hamming distance

        Examples
        --------
        >>> cmp = RelaxedHamming()
        >>> round(cmp.dist('cat', 'hat'), 12)
        0.333333333333
        >>> cmp.dist('Niall', 'Neil')
        0.27999999999999997
        >>> cmp.dist('aluminum', 'Catalan')
        0.8
        >>> cmp.dist('ATCG', 'TAGC')
        0.2


        .. versionadded:: 0.4.1

        """
        if src == tar:
            return 0.0
        score = self.dist_abs(src, tar)

        if self.params['tokenizer']:
            src = self.params['tokenizer'].tokenize(src).get_list()
            tar = self.params['tokenizer'].tokenize(tar).get_list()

        return score / max(len(src), len(tar))


if __name__ == '__main__':
    import doctest

    doctest.testmod()
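
For reference, a minimal usage sketch of the class listed above. It assumes Abydos is installed, that RelaxedHamming is re-exported from abydos.distance (as its __all__ suggests), and that a qval keyword passed through **kwargs ends up in self.params via the base-class constructor, which is what the __init__ check above relies on.

from abydos.distance import RelaxedHamming  # assumed re-export path

# Character-level comparison with the defaults (maxdist=2, discount=0.2).
cmp = RelaxedHamming()
print(cmp.dist_abs('Niall', 'Neil'))  # 1.4, per the doctest above
print(cmp.dist('Niall', 'Neil'))      # 1.4 / 5 positions, approximately 0.28

# Passing qval is assumed to switch the instance to bigram tokenization via
# QGrams, per the __init__ logic above.
cmp_bigram = RelaxedHamming(qval=2)
print(cmp_bigram.dist('Niall', 'Neil'))

Walking through dist_abs('Niall', 'Neil') with the defaults: after 'Neil' is padded to length 5, the 'i' at position 1 is found one position ahead in the target (+0.2), the 'a' at position 2 has no match within maxdist (+1.0), and the final 'l' is found one position back (+0.2), giving the 1.4 shown in the doctest.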