Completed
Push — master ( 3ac297...afe14d )
by Chris
16:40 queued 07:25
created

abydos.distance._eudex.sim_eudex()   A

Complexity

Conditions 1

Size

Total Lines 34
Code Lines 2

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 2
CRAP Score 1

Importance

Changes 0
Metric Value
cc 1
eloc 2
nop 4
dl 0
loc 34
ccs 2
cts 2
cp 1
crap 1
rs 10
c 0
b 0
f 0
1
# -*- coding: utf-8 -*-
2
3
# Copyright 2018 by Christopher C. Little.
4
# This file is part of Abydos.
5
#
6
# Abydos is free software: you can redistribute it and/or modify
7
# it under the terms of the GNU General Public License as published by
8
# the Free Software Foundation, either version 3 of the License, or
9
# (at your option) any later version.
10
#
11
# Abydos is distributed in the hope that it will be useful,
12
# but WITHOUT ANY WARRANTY; without even the implied warranty of
13
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14
# GNU General Public License for more details.
15
#
16
# You should have received a copy of the GNU General Public License
17
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.
18
19 1
"""abydos.distance._eudex.
20
21
eudex distance functions
22
"""
23
24 1
from __future__ import (
25
    absolute_import,
26
    division,
27
    print_function,
28
    unicode_literals,
29
)
30
31 1
from types import GeneratorType
32
33 1
from six.moves import range
34
35 1
from ._distance import _Distance
36 1
from ..phonetic import eudex
37
38 1
__all__ = ['Eudex', 'dist_eudex', 'eudex_hamming', 'sim_eudex']
39
40
41 1
class Eudex(_Distance):
42
    """Distance between the Eudex hashes of two terms.
43
44
    Cf. :cite:`Ticki:2016`.
45
    """
46
47 1
    @staticmethod
48
    def gen_fibonacci():
49
        """Yield the next Fibonacci number.
50
51
        Based on https://www.python-course.eu/generators.php
52
        Starts at Fibonacci number 3 (the second 1)
53
54
        Yields
55
        ------
56
        int
57
            The next Fibonacci number
58
59
        """
60 1
        num_a, num_b = 1, 2
61 1
        while True:
62 1
            yield num_a
63 1
            num_a, num_b = num_b, num_a + num_b
64
65 1
    @staticmethod
66 1
    def gen_exponential(base=2):
67
        """Yield the next value in an exponential series of the base.
68
69
        Starts at base**0
70
71
        Parameters
72
        ----------
73
        base : int
74
            The base to exponentiate
75
76
        Yields
77
        ------
78
        int
79
            The next power of `base`
80
81
        """
82 1
        exp = 0
83 1
        while True:
84 1
            yield base ** exp
85 1
            exp += 1
86
87 1
    def dist_abs(
88
        self, src, tar, weights='exponential', max_length=8, normalized=False
89
    ):
90
        """Calculate the distance between the Eudex hashes of two terms.
91
92
        Parameters
93
        ----------
94
        src : str
95
            Source string for comparison
96
        tar : str
97
            Target string for comparison
98
        weights : str, iterable, or generator function
99
            The weights or weights generator function
100
101
                - If set to ``None``, a simple Hamming distance is calculated.
102
                - If set to ``exponential``, weight decays by powers of 2, as
103
                  proposed in the eudex specification:
104
                  https://github.com/ticki/eudex.
105
                - If set to ``fibonacci``, weight decays through the Fibonacci
106
                  series, as in the eudex reference implementation.
107
                - If set to a callable function, this assumes it creates a
108
                  generator and the generator is used to populate a series of
109
                  weights.
110
                - If set to an iterable, the iterable's values should be
111
                  integers and will be used as the weights.
112
113
        max_length : int
114
            The number of characters to encode as a eudex hash
115
        normalized : bool
116
            Normalizes to [0, 1] if True
117
118
        Returns
119
        -------
120
        int
121
            The Eudex Hamming distance
122
123
        Examples
124
        --------
125
        >>> cmp = Eudex()
126
        >>> cmp.dist_abs('cat', 'hat')
127
        128
128
        >>> cmp.dist_abs('Niall', 'Neil')
129
        2
130
        >>> cmp.dist_abs('Colin', 'Cuilen')
131
        10
132
        >>> cmp.dist_abs('ATCG', 'TAGC')
133
        403
134
135
        >>> cmp.dist_abs('cat', 'hat', weights='fibonacci')
136
        34
137
        >>> cmp.dist_abs('Niall', 'Neil', weights='fibonacci')
138
        2
139
        >>> cmp.dist_abs('Colin', 'Cuilen', weights='fibonacci')
140
        7
141
        >>> cmp.dist_abs('ATCG', 'TAGC', weights='fibonacci')
142
        117
143
144
        >>> cmp.dist_abs('cat', 'hat', weights=None)
145
        1
146
        >>> cmp.dist_abs('Niall', 'Neil', weights=None)
147
        1
148
        >>> cmp.dist_abs('Colin', 'Cuilen', weights=None)
149
        2
150
        >>> cmp.dist_abs('ATCG', 'TAGC', weights=None)
151
        9
152
153
        >>> # Using the OEIS A000142:
154
        >>> cmp.dist_abs('cat', 'hat', [1, 1, 2, 6, 24, 120, 720, 5040])
155
        1
156
        >>> cmp.dist_abs('Niall', 'Neil', [1, 1, 2, 6, 24, 120, 720, 5040])
157
        720
158
        >>> cmp.dist_abs('Colin', 'Cuilen',
159
        ... [1, 1, 2, 6, 24, 120, 720, 5040])
160
        744
161
        >>> cmp.dist_abs('ATCG', 'TAGC', [1, 1, 2, 6, 24, 120, 720, 5040])
162
        6243
163
164
        """
165
        # Calculate the eudex hashes and XOR them
166 1
        xored = eudex(src, max_length=max_length) ^ eudex(
167
            tar, max_length=max_length
168
        )
169
170
        # Simple hamming distance (all bits are equal)
171 1
        if not weights:
172 1
            binary = bin(xored)
173 1
            distance = binary.count('1')
174 1
            if normalized:
175 1
                return distance / (len(binary) - 2)
176 1
            return distance
177
178
        # If weights is a function, it should create a generator,
179
        # which we now use to populate a list
180 1
        if callable(weights):
181 1
            weights = weights()
182 1
        elif weights == 'exponential':
183 1
            weights = Eudex.gen_exponential()
184 1
        elif weights == 'fibonacci':
185 1
            weights = Eudex.gen_fibonacci()
186 1
        if isinstance(weights, GeneratorType):
187 1
            weights = [next(weights) for _ in range(max_length)][::-1]
188
189
        # Sum the weighted hamming distance
190 1
        distance = 0
191 1
        max_distance = 0
192 1
        while (xored or normalized) and weights:
193 1
            max_distance += 8 * weights[-1]
194 1
            distance += bin(xored & 0xFF).count('1') * weights.pop()
195 1
            xored >>= 8
196
197 1
        if normalized:
198 1
            distance /= max_distance
199
200 1
        return distance
201
202 1
    def dist(self, src, tar, weights='exponential', max_length=8):
203
        """Return normalized distance between the Eudex hashes of two terms.
204
205
        This is Eudex distance normalized to [0, 1].
206
207
        Parameters
208
        ----------
209
        src : str
210
            Source string for comparison
211
        tar : str
212
            Target string for comparison
213
        weights : str, iterable, or generator function
214
            The weights or weights generator function
215
        max_length : int
216
            The number of characters to encode as a eudex hash
217
218
        Returns
219
        -------
220
        int
221
            The normalized Eudex Hamming distance
222
223
        Examples
224
        --------
225
        >>> cmp = Eudex()
226
        >>> round(cmp.dist('cat', 'hat'), 12)
227
        0.062745098039
228
        >>> round(cmp.dist('Niall', 'Neil'), 12)
229
        0.000980392157
230
        >>> round(cmp.dist('Colin', 'Cuilen'), 12)
231
        0.004901960784
232
        >>> round(cmp.dist('ATCG', 'TAGC'), 12)
233
        0.197549019608
234
235
        """
236 1
        return self.dist_abs(src, tar, weights, max_length, True)
237
238
239 1
def eudex_hamming(
240
    src, tar, weights='exponential', max_length=8, normalized=False
241
):
242
    """Calculate the Hamming distance between the Eudex hashes of two terms.
243
244
    This is a wrapper for :py:meth:`Eudex.eudex_hamming`.
245
246
    Parameters
247
    ----------
248
    src : str
249
        Source string for comparison
250
    tar : str
251
        Target string for comparison
252
    weights : str, iterable, or generator function
253
        The weights or weights generator function
254
    max_length : int
255
        The number of characters to encode as a eudex hash
256
    normalized : bool
257
        Normalizes to [0, 1] if True
258
259
    Returns
260
    -------
261
    int
262
        The Eudex Hamming distance
263
264
    Examples
265
    --------
266
    >>> eudex_hamming('cat', 'hat')
267
    128
268
    >>> eudex_hamming('Niall', 'Neil')
269
    2
270
    >>> eudex_hamming('Colin', 'Cuilen')
271
    10
272
    >>> eudex_hamming('ATCG', 'TAGC')
273
    403
274
275
    >>> eudex_hamming('cat', 'hat', weights='fibonacci')
276
    34
277
    >>> eudex_hamming('Niall', 'Neil', weights='fibonacci')
278
    2
279
    >>> eudex_hamming('Colin', 'Cuilen', weights='fibonacci')
280
    7
281
    >>> eudex_hamming('ATCG', 'TAGC', weights='fibonacci')
282
    117
283
284
    >>> eudex_hamming('cat', 'hat', weights=None)
285
    1
286
    >>> eudex_hamming('Niall', 'Neil', weights=None)
287
    1
288
    >>> eudex_hamming('Colin', 'Cuilen', weights=None)
289
    2
290
    >>> eudex_hamming('ATCG', 'TAGC', weights=None)
291
    9
292
293
    >>> # Using the OEIS A000142:
294
    >>> eudex_hamming('cat', 'hat', [1, 1, 2, 6, 24, 120, 720, 5040])
295
    1
296
    >>> eudex_hamming('Niall', 'Neil', [1, 1, 2, 6, 24, 120, 720, 5040])
297
    720
298
    >>> eudex_hamming('Colin', 'Cuilen', [1, 1, 2, 6, 24, 120, 720, 5040])
299
    744
300
    >>> eudex_hamming('ATCG', 'TAGC', [1, 1, 2, 6, 24, 120, 720, 5040])
301
    6243
302
303
    """
304 1
    return Eudex().dist_abs(src, tar, weights, max_length, normalized)
305
306
307 1
def dist_eudex(src, tar, weights='exponential', max_length=8):
308
    """Return normalized Hamming distance between Eudex hashes of two terms.
309
310
    This is a wrapper for :py:meth:`Eudex.dist`.
311
312
    Parameters
313
    ----------
314
    src : str
315
        Source string for comparison
316
    tar : str
317
        Target string for comparison
318
    weights : str, iterable, or generator function
319
        The weights or weights generator function
320
    max_length : int
321
        The number of characters to encode as a eudex hash
322
323
    Returns
324
    -------
325
    int
326
        The normalized Eudex Hamming distance
327
328
    Examples
329
    --------
330
    >>> round(dist_eudex('cat', 'hat'), 12)
331
    0.062745098039
332
    >>> round(dist_eudex('Niall', 'Neil'), 12)
333
    0.000980392157
334
    >>> round(dist_eudex('Colin', 'Cuilen'), 12)
335
    0.004901960784
336
    >>> round(dist_eudex('ATCG', 'TAGC'), 12)
337
    0.197549019608
338
339
    """
340 1
    return Eudex().dist(src, tar, weights, max_length)
341
342
343 1
def sim_eudex(src, tar, weights='exponential', max_length=8):
344
    """Return normalized Hamming similarity between Eudex hashes of two terms.
345
346
    This is a wrapper for :py:meth:`Eudex.sim`.
347
348
    Parameters
349
    ----------
350
    src : str
351
        Source string for comparison
352
    tar : str
353
        Target string for comparison
354
    weights : str, iterable, or generator function
355
        The weights or weights generator function
356
    max_length : int
357
        The number of characters to encode as a eudex hash
358
359
    Returns
360
    -------
361
    int
362
        The normalized Eudex Hamming similarity
363
364
    Examples
365
    --------
366
    >>> round(sim_eudex('cat', 'hat'), 12)
367
    0.937254901961
368
    >>> round(sim_eudex('Niall', 'Neil'), 12)
369
    0.999019607843
370
    >>> round(sim_eudex('Colin', 'Cuilen'), 12)
371
    0.995098039216
372
    >>> round(sim_eudex('ATCG', 'TAGC'), 12)
373
    0.802450980392
374
375
    """
376 1
    return Eudex().sim(src, tar, weights, max_length)
377
378
379
if __name__ == '__main__':
380
    import doctest
381
382
    doctest.testmod()
383