Completed
Branch master (78a222)
by Chris
14:36
created

abydos.distance._minkowski   A

Complexity

Total Complexity 17

Size/Duplication

Total Lines 341
Duplicated Lines 0 %

Test Coverage

Coverage 100%

Importance

Changes 0
Metric Value
wmc 17
eloc 55
dl 0
loc 341
ccs 41
cts 41
cp 1
rs 10
c 0
b 0
f 0

10 Functions

Rating   Name   Duplication   Size   Complexity  
A chebyshev() 0 28 1
A sim_manhattan() 0 23 1
A sim_minkowski() 0 24 1
A dist_euclidean() 0 23 1
A manhattan() 0 24 1
A sim_euclidean() 0 23 1
A dist_minkowski() 0 24 1
A dist_manhattan() 0 25 1
B minkowski() 0 49 8
A euclidean() 0 24 1
1
# -*- coding: utf-8 -*-
2
3
# Copyright 2018 by Christopher C. Little.
4
# This file is part of Abydos.
5
#
6
# Abydos is free software: you can redistribute it and/or modify
7
# it under the terms of the GNU General Public License as published by
8
# the Free Software Foundation, either version 3 of the License, or
9
# (at your option) any later version.
10
#
11
# Abydos is distributed in the hope that it will be useful,
12
# but WITHOUT ANY WARRANTY; without even the implied warranty of
13
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14
# GNU General Public License for more details.
15
#
16
# You should have received a copy of the GNU General Public License
17
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.
18
19 1
"""abydos.distance.minkowski.
20
21
The distance.minkowski module implements Minkowski token-based distances:
22
23
    - Minkowski distance & similarity
24
    - Manhattan distance & similarity
25
    - Euclidean distance & similarity
26
    - Chebyshev distance
27
"""
28
29 1
from __future__ import division, unicode_literals
30
31 1
from numbers import Number
32
33 1
from ._util import _get_qgrams
34
35
36 1
__all__ = [
37
    'chebyshev',
38
    'dist_euclidean',
39
    'dist_manhattan',
40
    'dist_minkowski',
41
    'euclidean',
42
    'manhattan',
43
    'minkowski',
44
    'sim_euclidean',
45
    'sim_manhattan',
46
    'sim_minkowski',
47
]
48
49
50 1
def minkowski(src, tar, qval=2, pval=1, normalized=False, alphabet=None):
0 ignored issues
show
best-practice introduced by
Too many arguments (6/5)
Loading history...
51
    """Return the Minkowski distance (:math:`L^p-norm`) of two strings.
52
53
    The Minkowski distance :cite:`Minkowski:1910` is a distance metric in
54
    :math:`L^p-space`.
55
56
    :param str src: source string (or QGrams/Counter objects) for comparison
57
    :param str tar: target string (or QGrams/Counter objects) for comparison
58
    :param int qval: the length of each q-gram; 0 for non-q-gram version
59
    :param int or float pval: the :math:`p`-value of the :math:`L^p`-space.
60
    :param bool normalized: normalizes to [0, 1] if True
61
    :param collection or int alphabet: the values or size of the alphabet
62
    :returns: the Minkowski distance
63
    :rtype: float
64
65
    >>> minkowski('cat', 'hat')
66
    4.0
67
    >>> minkowski('Niall', 'Neil')
68
    7.0
69
    >>> minkowski('Colin', 'Cuilen')
70
    9.0
71
    >>> minkowski('ATCG', 'TAGC')
72
    10.0
73
    """
74 1
    q_src, q_tar = _get_qgrams(src, tar, qval)
75 1
    diffs = ((q_src - q_tar) + (q_tar - q_src)).values()
76
77 1
    normalizer = 1
78 1
    if normalized:
79 1
        totals = (q_src + q_tar).values()
80 1
        if alphabet is not None:
81
            # noinspection PyTypeChecker
82 1
            normalizer = (
83
                alphabet if isinstance(alphabet, Number) else len(alphabet)
84
            )
85 1
        elif pval == 0:
86 1
            normalizer = len(totals)
87
        else:
88 1
            normalizer = sum(_ ** pval for _ in totals) ** (1 / pval)
89
90 1
    if len(diffs) == 0:
0 ignored issues
show
Unused Code introduced by
Do not use len(SEQUENCE) as condition value
Loading history...
91 1
        return 0.0
92 1
    if pval == float('inf'):
93
        # Chebyshev distance
94 1
        return max(diffs) / normalizer
95 1
    if pval == 0:
96
        # This is the l_0 "norm" as developed by David Donoho
97 1
        return len(diffs) / normalizer
98 1
    return sum(_ ** pval for _ in diffs) ** (1 / pval) / normalizer
99
100
101 1
def dist_minkowski(src, tar, qval=2, pval=1, alphabet=None):
102
    """Return normalized Minkowski distance of two strings.
103
104
    The normalized Minkowski distance :cite:`Minkowski:1910` is a distance
105
    metric in :math:`L^p-space`, normalized to [0, 1].
106
107
    :param str src: source string (or QGrams/Counter objects) for comparison
108
    :param str tar: target string (or QGrams/Counter objects) for comparison
109
    :param int qval: the length of each q-gram; 0 for non-q-gram version
110
    :param int or float pval: the :math:`p`-value of the :math:`L^p`-space.
111
    :param collection or int alphabet: the values or size of the alphabet
112
    :returns: the normalized Minkowski distance
113
    :rtype: float
114
115
    >>> dist_minkowski('cat', 'hat')
116
    0.5
117
    >>> round(dist_minkowski('Niall', 'Neil'), 12)
118
    0.636363636364
119
    >>> round(dist_minkowski('Colin', 'Cuilen'), 12)
120
    0.692307692308
121
    >>> dist_minkowski('ATCG', 'TAGC')
122
    1.0
123
    """
124 1
    return minkowski(src, tar, qval, pval, True, alphabet)
125
126
127 1
def sim_minkowski(src, tar, qval=2, pval=1, alphabet=None):
128
    """Return normalized Minkowski similarity of two strings.
129
130
    Minkowski similarity is the complement of Minkowski distance:
131
    :math:`sim_{Minkowski} = 1 - dist_{Minkowski}`.
132
133
    :param str src: source string (or QGrams/Counter objects) for comparison
134
    :param str tar: target string (or QGrams/Counter objects) for comparison
135
    :param int qval: the length of each q-gram; 0 for non-q-gram version
136
    :param int or float pval: the :math:`p`-value of the :math:`L^p`-space.
137
    :param collection or int alphabet: the values or size of the alphabet
138
    :returns: the normalized Minkowski similarity
139
    :rtype: float
140
141
    >>> sim_minkowski('cat', 'hat')
142
    0.5
143
    >>> round(sim_minkowski('Niall', 'Neil'), 12)
144
    0.363636363636
145
    >>> round(sim_minkowski('Colin', 'Cuilen'), 12)
146
    0.307692307692
147
    >>> sim_minkowski('ATCG', 'TAGC')
148
    0.0
149
    """
150 1
    return 1 - minkowski(src, tar, qval, pval, True, alphabet)
151
152
153 1
def manhattan(src, tar, qval=2, normalized=False, alphabet=None):
154
    """Return the Manhattan distance between two strings.
155
156
    Manhattan distance is the city-block or taxi-cab distance, equivalent
157
    to Minkowski distance in :math:`L^1`-space.
158
159
    :param str src: source string (or QGrams/Counter objects) for comparison
160
    :param str tar: target string (or QGrams/Counter objects) for comparison
161
    :param int qval: the length of each q-gram; 0 for non-q-gram version
162
    :param normalized: normalizes to [0, 1] if True
163
    :param collection or int alphabet: the values or size of the alphabet
164
    :returns: the Manhattan distance
165
    :rtype: float
166
167
    >>> manhattan('cat', 'hat')
168
    4.0
169
    >>> manhattan('Niall', 'Neil')
170
    7.0
171
    >>> manhattan('Colin', 'Cuilen')
172
    9.0
173
    >>> manhattan('ATCG', 'TAGC')
174
    10.0
175
    """
176 1
    return minkowski(src, tar, qval, 1, normalized, alphabet)
177
178
179 1
def dist_manhattan(src, tar, qval=2, alphabet=None):
180
    """Return the normalized Manhattan distance between two strings.
181
182
    The normalized Manhattan distance is a distance
183
    metric in :math:`L^1-space`, normalized to [0, 1].
184
185
    This is identical to Canberra distance.
186
187
    :param str src: source string (or QGrams/Counter objects) for comparison
188
    :param str tar: target string (or QGrams/Counter objects) for comparison
189
    :param int qval: the length of each q-gram; 0 for non-q-gram version
190
    :param collection or int alphabet: the values or size of the alphabet
191
    :returns: the normalized Manhattan distance
192
    :rtype: float
193
194
    >>> dist_manhattan('cat', 'hat')
195
    0.5
196
    >>> round(dist_manhattan('Niall', 'Neil'), 12)
197
    0.636363636364
198
    >>> round(dist_manhattan('Colin', 'Cuilen'), 12)
199
    0.692307692308
200
    >>> dist_manhattan('ATCG', 'TAGC')
201
    1.0
202
    """
203 1
    return manhattan(src, tar, qval, True, alphabet)
204
205
206 1
def sim_manhattan(src, tar, qval=2, alphabet=None):
207
    """Return the normalized Manhattan similarity of two strings.
208
209
    Manhattan similarity is the complement of Manhattan distance:
210
    :math:`sim_{Manhattan} = 1 - dist_{Manhattan}`.
211
212
    :param str src: source string (or QGrams/Counter objects) for comparison
213
    :param str tar: target string (or QGrams/Counter objects) for comparison
214
    :param int qval: the length of each q-gram; 0 for non-q-gram version
215
    :param collection or int alphabet: the values or size of the alphabet
216
    :returns: the normalized Manhattan similarity
217
    :rtype: float
218
219
    >>> sim_manhattan('cat', 'hat')
220
    0.5
221
    >>> round(sim_manhattan('Niall', 'Neil'), 12)
222
    0.363636363636
223
    >>> round(sim_manhattan('Colin', 'Cuilen'), 12)
224
    0.307692307692
225
    >>> sim_manhattan('ATCG', 'TAGC')
226
    0.0
227
    """
228 1
    return 1 - manhattan(src, tar, qval, True, alphabet)
229
230
231 1
def euclidean(src, tar, qval=2, normalized=False, alphabet=None):
232
    """Return the Euclidean distance between two strings.
233
234
    Euclidean distance is the straigh-line or as-the-crow-flies distance,
235
    equivalent to Minkowski distance in :math:`L^2`-space.
236
237
    :param str src: source string (or QGrams/Counter objects) for comparison
238
    :param str tar: target string (or QGrams/Counter objects) for comparison
239
    :param int qval: the length of each q-gram; 0 for non-q-gram version
240
    :param normalized: normalizes to [0, 1] if True
241
    :param collection or int alphabet: the values or size of the alphabet
242
    :returns: the Euclidean distance
243
    :rtype: float
244
245
    >>> euclidean('cat', 'hat')
246
    2.0
247
    >>> round(euclidean('Niall', 'Neil'), 12)
248
    2.645751311065
249
    >>> euclidean('Colin', 'Cuilen')
250
    3.0
251
    >>> round(euclidean('ATCG', 'TAGC'), 12)
252
    3.162277660168
253
    """
254 1
    return minkowski(src, tar, qval, 2, normalized, alphabet)
255
256
257 1
def dist_euclidean(src, tar, qval=2, alphabet=None):
258
    """Return the normalized Euclidean distance between two strings.
259
260
    The normalized Euclidean distance is a distance
261
    metric in :math:`L^2-space`, normalized to [0, 1].
262
263
    :param str src: source string (or QGrams/Counter objects) for comparison
264
    :param str tar: target string (or QGrams/Counter objects) for comparison
265
    :param int qval: the length of each q-gram; 0 for non-q-gram version
266
    :param collection or int alphabet: the values or size of the alphabet
267
    :returns: the normalized Euclidean distance
268
    :rtype: float
269
270
    >>> round(dist_euclidean('cat', 'hat'), 12)
271
    0.57735026919
272
    >>> round(dist_euclidean('Niall', 'Neil'), 12)
273
    0.683130051064
274
    >>> round(dist_euclidean('Colin', 'Cuilen'), 12)
275
    0.727606875109
276
    >>> dist_euclidean('ATCG', 'TAGC')
277
    1.0
278
    """
279 1
    return euclidean(src, tar, qval, True, alphabet)
280
281
282 1
def sim_euclidean(src, tar, qval=2, alphabet=None):
283
    """Return the normalized Euclidean similarity of two strings.
284
285
    Euclidean similarity is the complement of Euclidean distance:
286
    :math:`sim_{Euclidean} = 1 - dist_{Euclidean}`.
287
288
    :param str src: source string (or QGrams/Counter objects) for comparison
289
    :param str tar: target string (or QGrams/Counter objects) for comparison
290
    :param int qval: the length of each q-gram; 0 for non-q-gram version
291
    :param collection or int alphabet: the values or size of the alphabet
292
    :returns: the normalized Euclidean similarity
293
    :rtype: float
294
295
    >>> round(sim_euclidean('cat', 'hat'), 12)
296
    0.42264973081
297
    >>> round(sim_euclidean('Niall', 'Neil'), 12)
298
    0.316869948936
299
    >>> round(sim_euclidean('Colin', 'Cuilen'), 12)
300
    0.272393124891
301
    >>> sim_euclidean('ATCG', 'TAGC')
302
    0.0
303
    """
304 1
    return 1 - euclidean(src, tar, qval, True, alphabet)
305
306
307 1
def chebyshev(src, tar, qval=2, normalized=False, alphabet=None):
308
    r"""Return the Chebyshev distance between two strings.
309
310
    Euclidean distance is the chessboard distance,
311
    equivalent to Minkowski distance in :math:`L^\infty-space`.
312
313
    :param str src: source string (or QGrams/Counter objects) for comparison
314
    :param str tar: target string (or QGrams/Counter objects) for comparison
315
    :param int qval: the length of each q-gram; 0 for non-q-gram version
316
    :param normalized: normalizes to [0, 1] if True
317
    :param collection or int alphabet: the values or size of the alphabet
318
    :returns: the Chebyshev distance
319
    :rtype: float
320
321
    >>> chebyshev('cat', 'hat')
322
    1.0
323
    >>> chebyshev('Niall', 'Neil')
324
    1.0
325
    >>> chebyshev('Colin', 'Cuilen')
326
    1.0
327
    >>> chebyshev('ATCG', 'TAGC')
328
    1.0
329
    >>> chebyshev('ATCG', 'TAGC', qval=1)
330
    0.0
331
    >>> chebyshev('ATCGATTCGGAATTTC', 'TAGCATAATCGCCG', qval=1)
332
    3.0
333
    """
334 1
    return minkowski(src, tar, qval, float('inf'), normalized, alphabet)
335
336
337
if __name__ == '__main__':
338
    import doctest
339
340
    doctest.testmod()
341