Completed
Push — master ( f43547...71985b )
by Chris
12:00 queued 10s
created

abydos.distance._eudex.Eudex.gen_fibonacci()   A

Complexity

Conditions 2

Size

Total Lines 17
Code Lines 6

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 5
CRAP Score 2

Importance

Changes 0
Metric Value
cc 2
eloc 6
nop 0
dl 0
loc 17
ccs 5
cts 5
cp 1
crap 2
rs 10
c 0
b 0
f 0
1
# -*- coding: utf-8 -*-
2
3
# Copyright 2018 by Christopher C. Little.
4
# This file is part of Abydos.
5
#
6
# Abydos is free software: you can redistribute it and/or modify
7
# it under the terms of the GNU General Public License as published by
8
# the Free Software Foundation, either version 3 of the License, or
9
# (at your option) any later version.
10
#
11
# Abydos is distributed in the hope that it will be useful,
12
# but WITHOUT ANY WARRANTY; without even the implied warranty of
13
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14
# GNU General Public License for more details.
15
#
16
# You should have received a copy of the GNU General Public License
17
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.
18
19 1
"""abydos.distance._eudex.
20
21
eudex distance functions
22
"""
23
24 1
from __future__ import (
25
    absolute_import,
26
    division,
27
    print_function,
28
    unicode_literals,
29
)
30
31 1
from types import GeneratorType
32
33 1
from six.moves import range
34
35 1
from ._distance import _Distance
36 1
from ..phonetic import eudex
37
38 1
__all__ = ['Eudex', 'dist_eudex', 'eudex_hamming', 'sim_eudex']
39
40
41 1
class Eudex(_Distance):
0 ignored issues
show
Unused Code introduced by
The variable __class__ seems to be unused.
Loading history...
42
    """Distance between the Eudex hashes of two terms.
43
44
    Cf. :cite:`Ticki:2016`.
45
    """
46
47 1
    @staticmethod
48
    def gen_fibonacci():
49
        """Yield the next Fibonacci number.
50
51
        Based on https://www.python-course.eu/generators.php
52
        Starts at Fibonacci number 3 (the second 1)
53
54
        Yields
55
        ------
56
        int
57
            The next Fibonacci number
58
59
        """
60 1
        num_a, num_b = 1, 2
61 1
        while True:
62 1
            yield num_a
63 1
            num_a, num_b = num_b, num_a + num_b
64
65 1
    @staticmethod
66 1
    def gen_exponential(base=2):
67
        """Yield the next value in an exponential series of the base.
68
69
        Starts at base**0
70
71
        Parameters
72
        ----------
73
        base : int
74
            The base to exponentiate
75
76
        Yields
77
        ------
78
        int
79
            The next power of `base`
80
81
        """
82 1
        exp = 0
83 1
        while True:
84 1
            yield base ** exp
85 1
            exp += 1
86
87 1
    def dist_abs(
0 ignored issues
show
best-practice introduced by
Too many arguments (6/5)
Loading history...
Bug introduced by
Parameters differ from overridden 'dist_abs' method
Loading history...
88
        self, src, tar, weights='exponential', max_length=8, normalized=False
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
89
    ):
90
        """Calculate the distance between the Eudex hashes of two terms.
91
92
        Parameters
93
        ----------
94
        src : str
95
            Source string for comparison
96
        tar : str
97
            Target string for comparison
98
        weights : str, iterable, or generator function
99
            The weights or weights generator function
100
101
                - If set to ``None``, a simple Hamming distance is calculated.
102
                - If set to ``exponential``, weight decays by powers of 2, as
103
                  proposed in the eudex specification:
104
                  https://github.com/ticki/eudex.
105
                - If set to ``fibonacci``, weight decays through the Fibonacci
106
                  series, as in the eudex reference implementation.
107
                - If set to a callable function, this assumes it creates a
108
                  generator and the generator is used to populate a series of
109
                  weights.
110
                - If set to an iterable, the iterable's values should be
111
                  integers and will be used as the weights.
112
113
        max_length : int
114
            The number of characters to encode as a eudex hash
115
        normalized : bool
116
            Normalizes to [0, 1] if True
117
118
        Returns
119
        -------
120
        int
121
            The Eudex Hamming distance
122
123
        Examples
124
        --------
125
        >>> cmp = Eudex()
126
        >>> cmp.dist_abs('cat', 'hat')
127
        128
128
        >>> cmp.dist_abs('Niall', 'Neil')
129
        2
130
        >>> cmp.dist_abs('Colin', 'Cuilen')
131
        10
132
        >>> cmp.dist_abs('ATCG', 'TAGC')
133
        403
134
135
        >>> cmp.dist_abs('cat', 'hat', weights='fibonacci')
136
        34
137
        >>> cmp.dist_abs('Niall', 'Neil', weights='fibonacci')
138
        2
139
        >>> cmp.dist_abs('Colin', 'Cuilen', weights='fibonacci')
140
        7
141
        >>> cmp.dist_abs('ATCG', 'TAGC', weights='fibonacci')
142
        117
143
144
        >>> cmp.dist_abs('cat', 'hat', weights=None)
145
        1
146
        >>> cmp.dist_abs('Niall', 'Neil', weights=None)
147
        1
148
        >>> cmp.dist_abs('Colin', 'Cuilen', weights=None)
149
        2
150
        >>> cmp.dist_abs('ATCG', 'TAGC', weights=None)
151
        9
152
153
        >>> # Using the OEIS A000142:
154
        >>> cmp.dist_abs('cat', 'hat', [1, 1, 2, 6, 24, 120, 720, 5040])
155
        1
156
        >>> cmp.dist_abs('Niall', 'Neil', [1, 1, 2, 6, 24, 120, 720, 5040])
157
        720
158
        >>> cmp.dist_abs('Colin', 'Cuilen',
159
        ... [1, 1, 2, 6, 24, 120, 720, 5040])
160
        744
161
        >>> cmp.dist_abs('ATCG', 'TAGC', [1, 1, 2, 6, 24, 120, 720, 5040])
162
        6243
163
164
        """
165
        # Calculate the eudex hashes and XOR them
166 1
        xored = eudex(src, max_length=max_length) ^ eudex(
167
            tar, max_length=max_length
168
        )
169
170
        # Simple hamming distance (all bits are equal)
171 1
        if not weights:
172 1
            binary = bin(xored)
173 1
            distance = binary.count('1')
174 1
            if normalized:
175 1
                return distance / (len(binary) - 2)
176 1
            return distance
177
178
        # If weights is a function, it should create a generator,
179
        # which we now use to populate a list
180 1
        if callable(weights):
181 1
            weights = weights()
182 1
        elif weights == 'exponential':
183 1
            weights = Eudex.gen_exponential()
184 1
        elif weights == 'fibonacci':
185 1
            weights = Eudex.gen_fibonacci()
186 1
        if isinstance(weights, GeneratorType):
187 1
            weights = [next(weights) for _ in range(max_length)][::-1]
188
189
        # Sum the weighted hamming distance
190 1
        distance = 0
191 1
        max_distance = 0
192 1
        while (xored or normalized) and weights:
193 1
            max_distance += 8 * weights[-1]
194 1
            distance += bin(xored & 0xFF).count('1') * weights.pop()
195 1
            xored >>= 8
196
197 1
        if normalized:
198 1
            distance /= max_distance
199
200 1
        return distance
201
202 1
    def dist(self, src, tar, weights='exponential', max_length=8):
0 ignored issues
show
Bug introduced by
Parameters differ from overridden 'dist' method
Loading history...
203
        """Return normalized distance between the Eudex hashes of two terms.
204
205
        This is Eudex distance normalized to [0, 1].
206
207
        Parameters
208
        ----------
209
        src : str
210
            Source string for comparison
211
        tar : str
212
            Target string for comparison
213
        weights : str, iterable, or generator function
214
            The weights or weights generator function
215
        max_length : int
216
            The number of characters to encode as a eudex hash
217
218
        Returns
219
        -------
220
        int
221
            The normalized Eudex Hamming distance
222
223
        Examples
224
        --------
225
        >>> cmp = Eudex()
226
        >>> round(cmp.dist('cat', 'hat'), 12)
227
        0.062745098039
228
        >>> round(cmp.dist('Niall', 'Neil'), 12)
229
        0.000980392157
230
        >>> round(cmp.dist('Colin', 'Cuilen'), 12)
231
        0.004901960784
232
        >>> round(cmp.dist('ATCG', 'TAGC'), 12)
233
        0.197549019608
234
235
        """
236 1
        return self.dist_abs(src, tar, weights, max_length, True)
237
238
239 1
def eudex_hamming(
240
    src, tar, weights='exponential', max_length=8, normalized=False
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
241
):
242
    """Calculate the Hamming distance between the Eudex hashes of two terms.
243
244
    This is a wrapper for :py:meth:`Eudex.eudex_hamming`.
245
246
    Parameters
247
    ----------
248
    src : str
249
        Source string for comparison
250
    tar : str
251
        Target string for comparison
252
    weights : str, iterable, or generator function
253
        The weights or weights generator function
254
    max_length : int
255
        The number of characters to encode as a eudex hash
256
    normalized : bool
257
        Normalizes to [0, 1] if True
258
259
    Returns
260
    -------
261
    int
262
        The Eudex Hamming distance
263
264
    Examples
265
    --------
266
    >>> eudex_hamming('cat', 'hat')
267
    128
268
    >>> eudex_hamming('Niall', 'Neil')
269
    2
270
    >>> eudex_hamming('Colin', 'Cuilen')
271
    10
272
    >>> eudex_hamming('ATCG', 'TAGC')
273
    403
274
275
    >>> eudex_hamming('cat', 'hat', weights='fibonacci')
276
    34
277
    >>> eudex_hamming('Niall', 'Neil', weights='fibonacci')
278
    2
279
    >>> eudex_hamming('Colin', 'Cuilen', weights='fibonacci')
280
    7
281
    >>> eudex_hamming('ATCG', 'TAGC', weights='fibonacci')
282
    117
283
284
    >>> eudex_hamming('cat', 'hat', weights=None)
285
    1
286
    >>> eudex_hamming('Niall', 'Neil', weights=None)
287
    1
288
    >>> eudex_hamming('Colin', 'Cuilen', weights=None)
289
    2
290
    >>> eudex_hamming('ATCG', 'TAGC', weights=None)
291
    9
292
293
    >>> # Using the OEIS A000142:
294
    >>> eudex_hamming('cat', 'hat', [1, 1, 2, 6, 24, 120, 720, 5040])
295
    1
296
    >>> eudex_hamming('Niall', 'Neil', [1, 1, 2, 6, 24, 120, 720, 5040])
297
    720
298
    >>> eudex_hamming('Colin', 'Cuilen', [1, 1, 2, 6, 24, 120, 720, 5040])
299
    744
300
    >>> eudex_hamming('ATCG', 'TAGC', [1, 1, 2, 6, 24, 120, 720, 5040])
301
    6243
302
303
    """
304 1
    return Eudex().dist_abs(src, tar, weights, max_length, normalized)
305
306
307 1
def dist_eudex(src, tar, weights='exponential', max_length=8):
308
    """Return normalized Hamming distance between Eudex hashes of two terms.
309
310
    This is a wrapper for :py:meth:`Eudex.dist`.
311
312
    Parameters
313
    ----------
314
    src : str
315
        Source string for comparison
316
    tar : str
317
        Target string for comparison
318
    weights : str, iterable, or generator function
319
        The weights or weights generator function
320
    max_length : int
321
        The number of characters to encode as a eudex hash
322
323
    Returns
324
    -------
325
    int
326
        The normalized Eudex Hamming distance
327
328
    Examples
329
    --------
330
    >>> round(dist_eudex('cat', 'hat'), 12)
331
    0.062745098039
332
    >>> round(dist_eudex('Niall', 'Neil'), 12)
333
    0.000980392157
334
    >>> round(dist_eudex('Colin', 'Cuilen'), 12)
335
    0.004901960784
336
    >>> round(dist_eudex('ATCG', 'TAGC'), 12)
337
    0.197549019608
338
339
    """
340 1
    return Eudex().dist(src, tar, weights, max_length)
341
342
343 1
def sim_eudex(src, tar, weights='exponential', max_length=8):
344
    """Return normalized Hamming similarity between Eudex hashes of two terms.
345
346
    This is a wrapper for :py:meth:`Eudex.sim`.
347
348
    Parameters
349
    ----------
350
    src : str
351
        Source string for comparison
352
    tar : str
353
        Target string for comparison
354
    weights : str, iterable, or generator function
355
        The weights or weights generator function
356
    max_length : int
357
        The number of characters to encode as a eudex hash
358
359
    Returns
360
    -------
361
    int
362
        The normalized Eudex Hamming similarity
363
364
    Examples
365
    --------
366
    >>> round(sim_eudex('cat', 'hat'), 12)
367
    0.937254901961
368
    >>> round(sim_eudex('Niall', 'Neil'), 12)
369
    0.999019607843
370
    >>> round(sim_eudex('Colin', 'Cuilen'), 12)
371
    0.995098039216
372
    >>> round(sim_eudex('ATCG', 'TAGC'), 12)
373
    0.802450980392
374
375
    """
376 1
    return Eudex().sim(src, tar, weights, max_length)
377
378
379
if __name__ == '__main__':
380
    import doctest
381
382
    doctest.testmod()
383