Completed
Push — master ( d771dc...e4e852 )
by Chris
10:27
created

abydos.distance.dist_eudex()   A

Complexity

Conditions 1

Size

Total Lines 19
Code Lines 2

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
cc 1
eloc 2
nop 4
dl 0
loc 19
rs 10
c 0
b 0
f 0
1
# -*- coding: utf-8 -*-
0 ignored issues
show
coding-style introduced by
Too many lines in module (3132/1000)
Loading history...
2
3
# Copyright 2014-2018 by Christopher C. Little.
4
# This file is part of Abydos.
5
#
6
# Abydos is free software: you can redistribute it and/or modify
7
# it under the terms of the GNU General Public License as published by
8
# the Free Software Foundation, either version 3 of the License, or
9
# (at your option) any later version.
10
#
11
# Abydos is distributed in the hope that it will be useful,
12
# but WITHOUT ANY WARRANTY; without even the implied warranty of
13
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14
# GNU General Public License for more details.
15
#
16
# You should have received a copy of the GNU General Public License
17
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.
18
19
"""abydos.distance.
20
21
The distance module implements string edit distance functions including:
22
23
    - Levenshtein distance (incl. a [0, 1] normalized variant)
24
    - Optimal String Alignment distance (incl. a [0, 1] normalized variant)
25
    - Levenshtein-Damerau distance (incl. a [0, 1] normalized variant)
26
    - Hamming distance (incl. a [0, 1] normalized variant)
27
    - Tversky index
28
    - Sørensen–Dice coefficient & distance
29
    - Jaccard similarity coefficient & distance
30
    - overlap similarity & distance
31
    - Tanimoto coefficient & distance
32
    - Minkowski distance & similarity (incl. a [0, 1] normalized option)
33
    - Manhattan distance & similarity (incl. a [0, 1] normalized option)
34
    - Euclidean distance & similarity (incl. a [0, 1] normalized option)
35
    - Chebyshev distance & similarity (incl. a [0, 1] normalized option)
36
    - cosine similarity & distance
37
    - Jaro distance
38
    - Jaro-Winkler distance (incl. the strcmp95 algorithm variant)
39
    - Longest common substring
40
    - Ratcliff-Obershelp similarity & distance
41
    - Match Rating Algorithm similarity
42
    - Normalized Compression Distance (NCD) & similarity
43
    - Monge-Elkan similarity & distance
44
    - Matrix similarity
45
    - Needleman-Wunsch score
46
    - Smith-Waterman score
47
    - Gotoh score
48
    - Length similarity
49
    - Prefix, Suffix, and Identity similarity & distance
50
    - Modified Language-Independent Product Name Search (MLIPNS) similarity &
51
      distance
52
    - Bag distance (incl. a [0, 1] normalized variant)
53
    - Editex distance (incl. a [0, 1] normalized variant)
54
    - Eudex distances
55
    - TF-IDF similarity
56
57
Functions beginning with the prefixes 'sim' and 'dist' are guaranteed to be
58
in the range [0, 1], and sim_X = 1 - dist_X since the two are complements.
59
If a sim_X function is supplied identical src & tar arguments, it is guaranteed
60
to return 1; the corresponding dist_X function is guaranteed to return 0.
61
"""
62
63
from __future__ import division, unicode_literals
64
65
import codecs
66
import math
67
import sys
68
import types
69
import unicodedata
70
from collections import Counter, defaultdict
71
72
import numpy as np
73
74
from six import text_type
75
from six.moves import range
76
77
from .compression import ac_encode, ac_train, rle_encode
78
from .phonetic import eudex, mra
79
from .qgram import QGrams
80
81
try:
82
    import lzma
83
except ImportError:  # pragma: no cover
84
    # If the system lacks the lzma library, that's fine, but lzma compression
85
    # similarity won't be supported.
86
    pass
87
88
89
def levenshtein(src, tar, mode='lev', cost=(1, 1, 1, 1)):
    """Return the Levenshtein distance between two strings.

    Levenshtein distance

    This is the standard edit distance measure. Cf.
    https://en.wikipedia.org/wiki/Levenshtein_distance

    Two additional variants: optimal string alignment (aka restricted
    Damerau-Levenshtein distance) and the Damerau-Levenshtein distance
    are also supported. Cf.
    https://en.wikipedia.org/wiki/Damerau%E2%80%93Levenshtein_distance

    The ordinary Levenshtein & Optimal String Alignment distance both
    employ the Wagner-Fischer dynamic programming algorithm. Cf.
    https://en.wikipedia.org/wiki/Wagner%E2%80%93Fischer_algorithm

    Levenshtein edit distance ordinarily has unit insertion, deletion, and
    substitution costs.

    :param str src, tar: two strings to be compared
    :param str mode: specifies a mode for computing the Levenshtein distance:

        - 'lev' (default) computes the ordinary Levenshtein distance,
          in which edits may include inserts, deletes, and substitutions
        - 'osa' computes the Optimal String Alignment distance, in which
          edits may include inserts, deletes, substitutions, and
          transpositions but substrings may only be edited once
        - 'dam' computes the Damerau-Levenshtein distance, in which
          edits may include inserts, deletes, substitutions, and
          transpositions and substrings may undergo repeated edits

    :param tuple cost: a 4-tuple representing the cost of the four possible
        edits: inserts, deletes, substitutions, and transpositions,
        respectively (by default: (1, 1, 1, 1))
    :returns: the Levenshtein distance between src & tar
    :rtype: int (may return a float if cost has float values)

    >>> levenshtein('cat', 'hat')
    1
    >>> levenshtein('Niall', 'Neil')
    3
    >>> levenshtein('aluminum', 'Catalan')
    7
    >>> levenshtein('ATCG', 'TAGC')
    3

    >>> levenshtein('ATCG', 'TAGC', mode='osa')
    2
    >>> levenshtein('ACTG', 'TAGC', mode='osa')
    4

    >>> levenshtein('ATCG', 'TAGC', mode='dam')
    2
    >>> levenshtein('ACTG', 'TAGC', mode='dam')
    3
    """
    ins_cost, del_cost, sub_cost, trans_cost = cost

    # Trivial cases: identical strings, or one string empty (pure
    # inserts/deletes).
    if src == tar:
        return 0
    if not src:
        return len(tar) * ins_cost
    if not tar:
        return len(src) * del_cost

    # Unrestricted Damerau-Levenshtein needs a different algorithm.
    if 'dam' in mode:
        return damerau_levenshtein(src, tar, cost)

    # np.int was deprecated in NumPy 1.20 and later removed; the builtin
    # int is the supported equivalent dtype.
    d_mat = np.zeros((len(src)+1, len(tar)+1), dtype=int)
    for i in range(len(src)+1):
        d_mat[i, 0] = i * del_cost
    for j in range(len(tar)+1):
        d_mat[0, j] = j * ins_cost

    # Wagner-Fischer dynamic program, filled row by row.
    for i, src_c in enumerate(src):
        for j, tar_c in enumerate(tar):
            d_mat[i+1, j+1] = min(
                d_mat[i+1, j] + ins_cost,  # insertion
                d_mat[i, j+1] + del_cost,  # deletion
                d_mat[i, j] + (sub_cost if src_c != tar_c else 0)  # sub/match
            )

            if mode == 'osa':
                if ((i+1 > 1 and j+1 > 1 and src_c == tar[j-1] and
                     src[i-1] == tar_c)):
                    # transposition of adjacent characters
                    d_mat[i+1, j+1] = min(d_mat[i+1, j+1],
                                          d_mat[i-1, j-1] + trans_cost)

    return d_mat[len(src), len(tar)]
182
183
184
def dist_levenshtein(src, tar, mode='lev', cost=(1, 1, 1, 1)):
    """Return the normalized Levenshtein distance between two strings.

    Levenshtein distance normalized to the interval [0, 1]

    The raw Levenshtein distance (computed by any of the three supported
    methods) is divided by the larger of len(src) times the delete cost
    and len(tar) times the insert cost. With all unit costs this reduces
    to dividing by the longer string's length.

    :param str src, tar: two strings to be compared
    :param str mode: specifies a mode for computing the Levenshtein distance:

        - 'lev' (default) computes the ordinary Levenshtein distance,
          in which edits may include inserts, deletes, and substitutions
        - 'osa' computes the Optimal String Alignment distance, in which
          edits may include inserts, deletes, substitutions, and
          transpositions but substrings may only be edited once
        - 'dam' computes the Damerau-Levenshtein distance, in which
          edits may include inserts, deletes, substitutions, and
          transpositions and substrings may undergo repeated edits

    :param tuple cost: a 4-tuple representing the cost of the four possible
        edits: inserts, deletes, substitutions, and transpositions,
        respectively (by default: (1, 1, 1, 1))
    :returns: normalized Levenshtein distance
    :rtype: float

    >>> dist_levenshtein('cat', 'hat')
    0.33333333333333331
    >>> dist_levenshtein('Niall', 'Neil')
    0.59999999999999998
    >>> dist_levenshtein('aluminum', 'Catalan')
    0.875
    >>> dist_levenshtein('ATCG', 'TAGC')
    0.75
    """
    if src == tar:
        return 0
    ins_cost, del_cost = cost[:2]
    # Normalize by the worst case: deleting all of src or inserting all
    # of tar, whichever is costlier.
    raw = levenshtein(src, tar, mode, cost)
    worst_case = max(len(src) * del_cost, len(tar) * ins_cost)
    return raw / worst_case
228
229
230
def sim_levenshtein(src, tar, mode='lev', cost=(1, 1, 1, 1)):
    """Return the Levenshtein similarity of two strings.

    Levenshtein similarity normalized to the interval [0, 1]

    Defined as the complement of the normalized Levenshtein distance:
    :math:`sim_{Levenshtein} = 1 - dist_{Levenshtein}`

    The arguments are identical to those of the levenshtein() function.

    :param str src, tar: two strings to be compared
    :param str mode: specifies a mode for computing the Levenshtein distance:

            - 'lev' (default) computes the ordinary Levenshtein distance,
              in which edits may include inserts, deletes, and substitutions
            - 'osa' computes the Optimal String Alignment distance, in which
              edits may include inserts, deletes, substitutions, and
              transpositions but substrings may only be edited once
            - 'dam' computes the Damerau-Levenshtein distance, in which
              edits may include inserts, deletes, substitutions, and
              transpositions and substrings may undergo repeated edits

    :param tuple cost: a 4-tuple representing the cost of the four possible
        edits: inserts, deletes, substitutions, and transpositions,
        respectively (by default: (1, 1, 1, 1))
    :returns: normalized Levenshtein similarity
    :rtype: float

    >>> sim_levenshtein('cat', 'hat')
    0.66666666666666674
    >>> sim_levenshtein('Niall', 'Neil')
    0.40000000000000002
    >>> sim_levenshtein('aluminum', 'Catalan')
    0.125
    >>> sim_levenshtein('ATCG', 'TAGC')
    0.25
    """
    # Similarity is simply the complement of the normalized distance.
    distance = dist_levenshtein(src, tar, mode, cost)
    return 1 - distance
269
270
271
def damerau_levenshtein(src, tar, cost=(1, 1, 1, 1)):
    """Return the Damerau-Levenshtein distance between two strings.

    Damerau-Levenshtein distance

    This computes the Damerau-Levenshtein distance. Cf.
    https://en.wikipedia.org/wiki/Damerau%E2%80%93Levenshtein_distance

    Damerau-Levenshtein code based on Java code by Kevin L. Stern,
    under the MIT license:
    https://github.com/KevinStern/software-and-algorithms/blob/master/src/main/java/blogspot/software_and_algorithms/stern_library/string/DamerauLevenshteinAlgorithm.java

    :param str src, tar: two strings to be compared
    :param tuple cost: a 4-tuple representing the cost of the four possible
        edits:
        inserts, deletes, substitutions, and transpositions, respectively
        (by default: (1, 1, 1, 1))
    :returns: the Damerau-Levenshtein distance between src & tar
    :rtype: int (may return a float if cost has float values)
    :raises ValueError: if 2 * transposition cost is less than
        insert cost + delete cost (the algorithm requires otherwise)

    >>> damerau_levenshtein('cat', 'hat')
    1
    >>> damerau_levenshtein('Niall', 'Neil')
    3
    >>> damerau_levenshtein('aluminum', 'Catalan')
    7
    >>> damerau_levenshtein('ATCG', 'TAGC')
    2
    """
    ins_cost, del_cost, sub_cost, trans_cost = cost

    # Trivial cases: identical strings, or one string empty.
    if src == tar:
        return 0
    if not src:
        return len(tar) * ins_cost
    if not tar:
        return len(src) * del_cost

    # The algorithm assumes a transposition is never cheaper than doing
    # the same rearrangement via an insert plus a delete.
    if 2*trans_cost < ins_cost + del_cost:
        raise ValueError('Unsupported cost assignment; the cost of two ' +
                         'transpositions must not be less than the cost of ' +
                         'an insert plus a delete.')

    # np.int was deprecated in NumPy 1.20 and later removed; the builtin
    # int is the supported equivalent dtype.
    d_mat = (np.zeros((len(src))*(len(tar)), dtype=int).
             reshape((len(src), len(tar))))

    if src[0] != tar[0]:
        d_mat[0, 0] = min(sub_cost, ins_cost + del_cost)

    # Maps each character of src (seen so far) to its most recent index,
    # used to locate candidate transposition partners.
    src_index_by_character = {src[0]: 0}

    # First column: edit prefixes of src into tar[0].
    for i in range(1, len(src)):
        del_distance = d_mat[i-1, 0] + del_cost
        ins_distance = (i+1) * del_cost + ins_cost
        match_distance = (i * del_cost +
                          (0 if src[i] == tar[0] else sub_cost))
        d_mat[i, 0] = min(del_distance, ins_distance, match_distance)

    # First row: edit src[0] into prefixes of tar.
    for j in range(1, len(tar)):
        del_distance = (j+1) * ins_cost + del_cost
        ins_distance = d_mat[0, j-1] + ins_cost
        match_distance = (j * ins_cost +
                          (0 if src[0] == tar[j] else sub_cost))
        d_mat[0, j] = min(del_distance, ins_distance, match_distance)

    for i in range(1, len(src)):
        max_src_letter_match_index = (0 if src[i] == tar[0] else -1)
        for j in range(1, len(tar)):
            # dict.get with a default replaces the membership-test
            # conditional expression.
            candidate_swap_index = src_index_by_character.get(tar[j], -1)
            j_swap = max_src_letter_match_index
            del_distance = d_mat[i-1, j] + del_cost
            ins_distance = d_mat[i, j-1] + ins_cost
            match_distance = d_mat[i-1, j-1]
            if src[i] != tar[j]:
                match_distance += sub_cost
            else:
                max_src_letter_match_index = j

            if candidate_swap_index != -1 and j_swap != -1:
                # A transposition is possible; cost is the distance before
                # the swapped pair plus the deletes/inserts to bridge the
                # gap plus the transposition itself.
                i_swap = candidate_swap_index

                if i_swap == 0 and j_swap == 0:
                    pre_swap_cost = 0
                else:
                    pre_swap_cost = d_mat[max(0, i_swap-1), max(0, j_swap-1)]
                swap_distance = (pre_swap_cost + (i - i_swap - 1) *
                                 del_cost + (j - j_swap - 1) * ins_cost +
                                 trans_cost)
            else:
                # Sentinel: transposition not applicable here.
                swap_distance = sys.maxsize

            d_mat[i, j] = min(del_distance, ins_distance,
                              match_distance, swap_distance)
        src_index_by_character[src[i]] = i

    return d_mat[len(src)-1, len(tar)-1]
371
372
373
def dist_damerau(src, tar, cost=(1, 1, 1, 1)):
    """Return the Damerau-Levenshtein similarity of two strings.

    Damerau-Levenshtein distance normalized to the interval [0, 1]

    The raw Damerau-Levenshtein distance is divided by the larger of
    len(src) times the delete cost and len(tar) times the insert cost.
    With all unit costs this reduces to dividing by the longer string's
    length.

    The arguments are identical to those of the levenshtein() function.

    :param str src, tar: two strings to be compared
    :param tuple cost: a 4-tuple representing the cost of the four possible
        edits:
        inserts, deletes, substitutions, and transpositions, respectively
        (by default: (1, 1, 1, 1))
    :returns: normalized Damerau-Levenshtein distance
    :rtype: float

    >>> dist_damerau('cat', 'hat')
    0.33333333333333331
    >>> dist_damerau('Niall', 'Neil')
    0.59999999999999998
    >>> dist_damerau('aluminum', 'Catalan')
    0.875
    >>> dist_damerau('ATCG', 'TAGC')
    0.5
    """
    if src == tar:
        return 0
    ins_cost, del_cost = cost[:2]
    # Normalize by the costlier of deleting all of src or inserting all
    # of tar.
    worst_case = max(len(src) * del_cost, len(tar) * ins_cost)
    return damerau_levenshtein(src, tar, cost) / worst_case
409
410
411
def sim_damerau(src, tar, cost=(1, 1, 1, 1)):
    """Return the Damerau-Levenshtein similarity of two strings.

    Damerau-Levenshtein similarity normalized to the interval [0, 1]

    Defined as the complement of the normalized Damerau-Levenshtein
    distance:
    :math:`sim_{Damerau} = 1 - dist_{Damerau}`

    The arguments are identical to those of the levenshtein() function.

    :param str src, tar: two strings to be compared
    :param tuple cost: a 4-tuple representing the cost of the four possible
        edits:
        inserts, deletes, substitutions, and transpositions, respectively
        (by default: (1, 1, 1, 1))
    :returns: normalized Damerau-Levenshtein similarity
    :rtype: float

    >>> sim_damerau('cat', 'hat')
    0.66666666666666674
    >>> sim_damerau('Niall', 'Neil')
    0.40000000000000002
    >>> sim_damerau('aluminum', 'Catalan')
    0.125
    >>> sim_damerau('ATCG', 'TAGC')
    0.5
    """
    # Similarity is the complement of the normalized distance.
    distance = dist_damerau(src, tar, cost)
    return 1 - distance
440
441
442
def hamming(src, tar, difflens=True):
    """Return the Hamming distance between two strings.

    Hamming distance

    Hamming distance equals the number of character positions at which two
    strings differ. For strings of unequal lengths, it is not normally
    defined. By default, this implementation compares the overlapping
    prefix position by position and then adds the difference in lengths.

    :param str src, tar: two strings to be compared
    :param bool allow_different_lengths:
        If True (default), this returns the Hamming distance for those
        characters that have a matching character in both strings plus the
        difference in the strings' lengths. This is equivalent to extending
        the shorter string with obligatorily non-matching characters.
        If False, an exception is raised in the case of strings of unequal
        lengths.
    :returns: the Hamming distance between src & tar
    :rtype: int
    :raises ValueError: if difflens is False and the strings differ in
        length

    >>> hamming('cat', 'hat')
    1
    >>> hamming('Niall', 'Neil')
    3
    >>> hamming('aluminum', 'Catalan')
    8
    >>> hamming('ATCG', 'TAGC')
    4
    """
    if not difflens and len(src) != len(tar):
        raise ValueError('Undefined for sequences of unequal length; set ' +
                         'difflens to True for Hamming distance between ' +
                         'strings of unequal lengths.')

    # Positions beyond the shorter string count as mismatches when
    # difflens is enabled.
    length_penalty = abs(len(src) - len(tar)) if difflens else 0
    mismatches = sum(1 for c_a, c_b in zip(src, tar) if c_a != c_b)
    return mismatches + length_penalty
484
485
486
def dist_hamming(src, tar, difflens=True):
    """Return the normalized Hamming distance between two strings.

    Hamming distance normalized to the interval [0, 1]

    The Hamming distance is divided by the length of the longer of src &
    tar (unless difflens is set to False, in which case an exception is
    raised for unequal lengths).

    The arguments are identical to those of the hamming() function.

    :param str src, tar: two strings to be compared
    :param bool allow_different_lengths:
        If True (default), this returns the Hamming distance for those
        characters that have a matching character in both strings plus the
        difference in the strings' lengths. This is equivalent to extending
        the shorter string with obligatorily non-matching characters.
        If False, an exception is raised in the case of strings of unequal
        lengths.
    :returns: normalized Hamming distance
    :rtype: float

    >>> dist_hamming('cat', 'hat')
    0.3333333333333333
    >>> dist_hamming('Niall', 'Neil')
    0.6
    >>> dist_hamming('aluminum', 'Catalan')
    1.0
    >>> dist_hamming('ATCG', 'TAGC')
    1.0
    """
    if src == tar:
        return 0
    # Normalize by the longer string's length.
    longer = max(len(src), len(tar))
    return hamming(src, tar, difflens) / longer
520
521
522
def sim_hamming(src, tar, difflens=True):
    """Return the normalized Hamming similarity of two strings.

    Hamming similarity normalized to the interval [0, 1]

    Defined as the complement of the normalized Hamming distance:
    :math:`sim_{Hamming} = 1 - dist{Hamming}`

    Provided that difflens==True, the Hamming similarity is identical to the
    Language-Independent Product Name Search (LIPNS) similarity score. For
    further information, see the sim_mlipns documentation.

    The arguments are identical to those of the hamming() function.

    :param str src, tar: two strings to be compared
    :param bool allow_different_lengths:
        If True (default), this returns the Hamming distance for those
        characters that have a matching character in both strings plus the
        difference in the strings' lengths. This is equivalent to extending
        the shorter string with obligatorily non-matching characters.
        If False, an exception is raised in the case of strings of unequal
        lengths.
    :returns: normalized Hamming similarity
    :rtype: float

    >>> sim_hamming('cat', 'hat')
    0.6666666666666667
    >>> sim_hamming('Niall', 'Neil')
    0.4
    >>> sim_hamming('aluminum', 'Catalan')
    0.0
    >>> sim_hamming('ATCG', 'TAGC')
    0.0
    """
    # Similarity is the complement of the normalized distance.
    distance = dist_hamming(src, tar, difflens)
    return 1 - distance
557
558
559
def _get_qgrams(src, tar, qval):
560
    """Return the Q-Grams in src & tar.
561
562
    :param str src, tar: two strings to be compared
563
        (or QGrams/Counter objects)
564
    :param int qval: the length of each q-gram; 0 or None for non-q-gram
565
        version
566
    :return: Q-Grams
567
    """
568
    if isinstance(src, Counter) and isinstance(tar, Counter):
0 ignored issues
show
unused-code introduced by
Unnecessary "elif" after "return"
Loading history...
569
        return src, tar
570
    elif qval and qval > 0:
571
        return QGrams(src, qval), QGrams(tar, qval)
572
    else:
573
        return Counter(src.strip().split()), Counter(tar.strip().split())
574
575
576
def sim_tversky(src, tar, qval=2, alpha=1, beta=1, bias=None):
    r"""Return the Tversky index of two strings.

    Tversky index

    The Tversky index is defined as:
    For two sets X and Y:
    :math:`sim_{Tversky}(X, Y) = \\frac{|X \\cap Y|}
    {|X \\cap Y| + \\alpha|X - Y| + \\beta|Y - X|}`

    Cf. https://en.wikipedia.org/wiki/Tversky_index

    :math:`\\alpha = \\beta = 1` is equivalent to the Jaccard & Tanimoto
    similarity coefficients.

    :math:`\\alpha = \\beta = 0.5` is equivalent to the Sørensen-Dice
    similarity coefficient.

    Unequal α and β will tend to emphasize one or the other set's
    contributions:

        - :math:`\\alpha > \\beta` emphasizes the contributions of X over Y
        - :math:`\\alpha < \\beta` emphasizes the contributions of Y over X)

    Parameter values' relation to 1 emphasizes different types of
    contributions:

        - :math:`\\alpha and \\beta > 1` emphasize unique contributions over
          the intersection
        - :math:`\\alpha and \\beta < 1` emphasize the intersection over
          unique contributions

    The symmetric variant is defined in Jiminez, Sergio, Claudio Becerra, and
    Alexander Gelbukh. 2013. SOFTCARDINALITY-CORE: Improving Text Overlap with
    Distributional Measures for Semantic Textual Similarity. This is activated
    by specifying a bias parameter.
    Cf. http://aclweb.org/anthology/S/S13/S13-1028.pdf

    :param str src, tar: two strings to be compared
        (or QGrams/Counter objects)
    :param int qval: the length of each q-gram; 0 or None for non-q-gram
        version
    :param float alpha, beta: two Tversky index parameters as indicated in the
        description below
    :param float bias: if supplied, activates the symmetric variant
    :returns: Tversky similarity
    :rtype: float
    :raises ValueError: if alpha or beta is negative

    >>> sim_tversky('cat', 'hat')
    0.3333333333333333
    >>> sim_tversky('Niall', 'Neil')
    0.2222222222222222
    >>> sim_tversky('aluminum', 'Catalan')
    0.0625
    >>> sim_tversky('ATCG', 'TAGC')
    0.0
    """
    if alpha < 0 or beta < 0:
        raise ValueError('Unsupported weight assignment; alpha and beta ' +
                         'must be greater than or equal to 0.')

    # Guard clauses (no elif after return, per lint): identical strings
    # are maximally similar; an empty string shares nothing.
    if src == tar:
        return 1.0
    if not src or not tar:
        return 0.0

    q_src, q_tar = _get_qgrams(src, tar, qval)
    q_src_mag = sum(q_src.values())
    q_tar_mag = sum(q_tar.values())
    q_intersection_mag = sum((q_src & q_tar).values())

    if not q_src or not q_tar:
        return 0.0

    # Standard (asymmetric) Tversky index.
    if bias is None:
        return q_intersection_mag / (q_intersection_mag + alpha *
                                     (q_src_mag - q_intersection_mag) +
                                     beta * (q_tar_mag - q_intersection_mag))

    # Symmetric variant (Jiminez, Becerra & Gelbukh 2013).
    a_val = min(q_src_mag - q_intersection_mag,
                q_tar_mag - q_intersection_mag)
    b_val = max(q_src_mag - q_intersection_mag,
                q_tar_mag - q_intersection_mag)
    c_val = q_intersection_mag + bias
    return c_val / (beta * (alpha * a_val + (1 - alpha) * b_val) + c_val)
660
661
662
def dist_tversky(src, tar, qval=2, alpha=1, beta=1, bias=None):
    """Return the Tversky distance between two strings.

    Tversky distance

    Tversky distance is the complement of the Tversky index (similarity):
    :math:`dist_{Tversky} = 1-sim_{Tversky}`

    The symmetric variant is defined in Jiminez, Sergio, Claudio Becerra, and
    Alexander Gelbukh. 2013. SOFTCARDINALITY-CORE: Improving Text Overlap with
    Distributional Measures for Semantic Textual Similarity. This is activated
    by specifying a bias parameter.
    Cf. http://aclweb.org/anthology/S/S13/S13-1028.pdf

    :param str src, tar: two strings to be compared
        (or QGrams/Counter objects)
    :param int qval: the length of each q-gram; 0 or None for non-q-gram
        version
    :param float alpha, beta: two Tversky index parameters as indicated in the
        description below
    :param float bias: if supplied, activates the symmetric variant
    :returns: Tversky distance
    :rtype: float

    >>> dist_tversky('cat', 'hat')
    0.6666666666666667
    >>> dist_tversky('Niall', 'Neil')
    0.7777777777777778
    >>> dist_tversky('aluminum', 'Catalan')
    0.9375
    >>> dist_tversky('ATCG', 'TAGC')
    1.0
    """
    # Distance is the complement of the Tversky similarity index.
    return 1 - sim_tversky(src, tar, qval, alpha, beta, bias)
695
696
697
def sim_dice(src, tar, qval=2):
    r"""Return the Sørensen–Dice coefficient of two strings.

    Sørensen–Dice coefficient

    For two sets X and Y, the Sørensen–Dice coefficient is
    :math:`sim_{dice}(X, Y) = \\frac{2 \\cdot |X \\cap Y|}{|X| + |Y|}`

    This is identical to the Tanimoto similarity coefficient
    and the Tversky index for :math:`\\alpha = \\beta = 0.5`

    :param str src, tar: two strings to be compared (or QGrams/Counter objects)
    :param int qval: the length of each q-gram; 0 or None for non-q-gram
        version
    :returns: Sørensen–Dice similarity
    :rtype: float

    >>> sim_dice('cat', 'hat')
    0.5
    >>> sim_dice('Niall', 'Neil')
    0.36363636363636365
    >>> sim_dice('aluminum', 'Catalan')
    0.11764705882352941
    >>> sim_dice('ATCG', 'TAGC')
    0.0
    """
    # The Dice coefficient is the Tversky index with both weights at 0.5.
    return sim_tversky(src, tar, qval, alpha=0.5, beta=0.5)
724
725
726
def dist_dice(src, tar, qval=2):
    """Return the Sørensen–Dice distance between two strings.

    Sørensen–Dice distance

    Sørensen–Dice distance is the complement of the Sørensen–Dice
    coefficient:
    :math:`dist_{dice} = 1 - sim_{dice}`

    :param str src, tar: two strings to be compared (or QGrams/Counter objects)
    :param int qval: the length of each q-gram; 0 or None for non-q-gram
        version
    :returns: Sørensen–Dice distance
    :rtype: float

    >>> dist_dice('cat', 'hat')
    0.5
    >>> dist_dice('Niall', 'Neil')
    0.6363636363636364
    >>> dist_dice('aluminum', 'Catalan')
    0.8823529411764706
    >>> dist_dice('ATCG', 'TAGC')
    1.0
    """
    # Distance is the complement of the Dice similarity coefficient.
    return 1 - sim_dice(src, tar, qval)
750
751
752
def sim_jaccard(src, tar, qval=2):
    r"""Return the Jaccard similarity of two strings.

    Jaccard similarity coefficient

    For two sets X and Y, the Jaccard similarity coefficient is
    :math:`sim_{jaccard}(X, Y) = \\frac{|X \\cap Y|}{|X \\cup Y|}`

    This is identical to the Tanimoto similarity coefficient
    and the Tversky index for :math:`\\alpha = \\beta = 1`

    :param str src, tar: two strings to be compared (or QGrams/Counter objects)
    :param int qval: the length of each q-gram; 0 or None for non-q-gram
        version
    :returns: Jaccard similarity
    :rtype: float

    >>> sim_jaccard('cat', 'hat')
    0.3333333333333333
    >>> sim_jaccard('Niall', 'Neil')
    0.2222222222222222
    >>> sim_jaccard('aluminum', 'Catalan')
    0.0625
    >>> sim_jaccard('ATCG', 'TAGC')
    0.0
    """
    # The Jaccard coefficient is the Tversky index with both weights at 1.
    return sim_tversky(src, tar, qval, alpha=1, beta=1)
779
780
781
def dist_jaccard(src, tar, qval=2):
    """Return the Jaccard distance between two strings.

    Jaccard distance

    Jaccard distance is the complement of the Jaccard similarity coefficient:
    :math:`dist_{Jaccard} = 1 - sim_{Jaccard}`

    :param str src, tar: two strings to be compared (or QGrams/Counter objects)
    :param int qval: the length of each q-gram; 0 or None for non-q-gram
        version
    :returns: Jaccard distance, in the range [0, 1]
    :rtype: float

    >>> dist_jaccard('cat', 'hat')
    0.6666666666666667
    >>> dist_jaccard('Niall', 'Neil')
    0.7777777777777778
    >>> dist_jaccard('aluminum', 'Catalan')
    0.9375
    >>> dist_jaccard('ATCG', 'TAGC')
    1.0
    """
    return 1 - sim_jaccard(src, tar, qval)
805
806
807
def sim_overlap(src, tar, qval=2):
    r"""Return the overlap coefficient of two strings.

    Overlap coefficient

    For two sets X and Y, the overlap coefficient is
    :math:`sim_{overlap}(X, Y) = \\frac{|X \\cap Y|}{min(|X|, |Y|)}`

    :param str src, tar: two strings to be compared (or QGrams/Counter objects)
    :param int qval: the length of each q-gram; 0 or None for non-q-gram
        version
    :returns: overlap similarity, in the range [0, 1]
    :rtype: float

    >>> sim_overlap('cat', 'hat')
    0.5
    >>> sim_overlap('Niall', 'Neil')
    0.4
    >>> sim_overlap('aluminum', 'Catalan')
    0.125
    >>> sim_overlap('ATCG', 'TAGC')
    0.0
    """
    # Guard clauses: identical strings are a perfect match; an empty
    # string shares nothing with anything. (Plain `if` instead of
    # `elif`-after-`return`, per lint.)
    if src == tar:
        return 1.0
    if not src or not tar:
        return 0.0

    q_src, q_tar = _get_qgrams(src, tar, qval)
    q_src_mag = sum(q_src.values())
    q_tar_mag = sum(q_tar.values())
    # Multiset intersection counts q-grams common to both strings.
    q_intersection_mag = sum((q_src & q_tar).values())

    return q_intersection_mag / min(q_src_mag, q_tar_mag)
841
842
843
def dist_overlap(src, tar, qval=2):
    """Return the overlap distance between two strings.

    Overlap distance

    Overlap distance is the complement of the overlap coefficient:
    :math:`dist_{overlap} = 1 - sim_{overlap}`

    :param str src, tar: two strings to be compared (or QGrams/Counter objects)
    :param int qval: the length of each q-gram; 0 or None for non-q-gram
        version
    :returns: overlap distance, in the range [0, 1]
    :rtype: float

    >>> dist_overlap('cat', 'hat')
    0.5
    >>> dist_overlap('Niall', 'Neil')
    0.6
    >>> dist_overlap('aluminum', 'Catalan')
    0.875
    >>> dist_overlap('ATCG', 'TAGC')
    1.0
    """
    return 1 - sim_overlap(src, tar, qval)
867
868
869
def sim_tanimoto(src, tar, qval=2):
    r"""Return the Tanimoto similarity of two strings.

    Tanimoto similarity

    For two sets X and Y, the Tanimoto similarity coefficient is
    :math:`sim_{Tanimoto}(X, Y) = \\frac{|X \\cap Y|}{|X \\cup Y|}`
    This is identical to the Jaccard similarity coefficient
    and the Tversky index for :math:`\\alpha = \\beta = 1`

    :param str src, tar: two strings to be compared (or QGrams/Counter objects)
    :param int qval: the length of each q-gram; 0 or None for non-q-gram
        version
    :returns: Tanimoto similarity, in the range [0, 1]
    :rtype: float

    >>> sim_tanimoto('cat', 'hat')
    0.3333333333333333
    >>> sim_tanimoto('Niall', 'Neil')
    0.2222222222222222
    >>> sim_tanimoto('aluminum', 'Catalan')
    0.0625
    >>> sim_tanimoto('ATCG', 'TAGC')
    0.0
    """
    # The Tanimoto coefficient is mathematically identical to Jaccard,
    # so simply delegate.
    return sim_jaccard(src, tar, qval)
895
896
897
def tanimoto(src, tar, qval=2):
    """Return the Tanimoto distance between two strings.

    Tanimoto distance

    Tanimoto distance is the base-2 logarithm of the Tanimoto (Jaccard)
    coefficient: :math:`-log_{2}sim_{Tanimoto}`.  When the strings share
    no q-grams at all, the coefficient is 0 and the distance is
    negative infinity.

    :param str src, tar: two strings to be compared (or QGrams/Counter objects)
    :param int qval: the length of each q-gram; 0 or None for non-q-gram
        version
    :returns: Tanimoto distance
    :rtype: float

    >>> tanimoto('cat', 'hat')
    -1.5849625007211563
    >>> tanimoto('Niall', 'Neil')
    -2.1699250014423126
    >>> tanimoto('aluminum', 'Catalan')
    -4.0
    >>> tanimoto('ATCG', 'TAGC')
    -inf
    """
    jaccard_coeff = sim_jaccard(src, tar, qval)
    if jaccard_coeff == 0:
        # log2(0) is undefined; by convention the distance is -inf.
        return float('-inf')
    return math.log(jaccard_coeff, 2)
924
925
926
def minkowski(src, tar, qval=2, pval=1, normalize=False):
    """Return the Minkowski distance (:math:`L^p-norm`) of two strings.

    The distance is computed over the q-gram count difference multiset of
    the two strings.

    :param str src, tar: two strings to be compared (or QGrams/Counter objects)
    :param int qval: the length of each q-gram; 0 or None for non-q-gram
        version
    :param int or float pval: the :math:`p`-value of the :math:`L^p`-space
        (1 = Manhattan, 2 = Euclidean, 0 = the :math:`l_0` "norm",
        float('inf') = Chebyshev)
    :param bool normalize: if True, normalizes the result to [0, 1]
    :returns: the Minkowski distance
    :rtype: float
    """
    q_src, q_tar = _get_qgrams(src, tar, qval)
    # Symmetric difference of the two q-gram multisets (per-q-gram count
    # differences).
    diffs = ((q_src - q_tar) + (q_tar - q_src)).values()

    normalizer = 1
    if normalize:
        totals = (q_src + q_tar).values()
        if pval == 0:
            normalizer = len(totals)
        else:
            normalizer = sum(_**pval for _ in totals)**(1 / pval)

    if pval == float('inf'):
        # Chebyshev distance (limit of L^p as p -> inf)
        return max(diffs)/normalizer
    if pval == 0:
        # This is the l_0 "norm" as developed by David Donoho.
        # Bug fix: the normalizer was previously not applied in this
        # branch, so normalize=True was silently ignored for pval=0.
        return len(diffs)/normalizer
    return sum(_**pval for _ in diffs)**(1 / pval)/normalizer
954
955
956
def dist_minkowski(src, tar, qval=2, pval=1):
    """Return Minkowski distance of two strings, normalized to [0, 1].

    :param str src, tar: two strings to be compared (or QGrams/Counter objects)
    :param int qval: the length of each q-gram; 0 or None for non-q-gram
        version
    :param int or float pval: the :math:`p`-value of the :math:`L^p`-space
    :returns: normalized Minkowski distance
    :rtype: float
    """
    return minkowski(src, tar, qval, pval, True)
966
967
968
def sim_minkowski(src, tar, qval=2, pval=1):
    """Return Minkowski similarity of two strings, normalized to [0, 1].

    The similarity is the complement of the normalized Minkowski distance.

    :param str src, tar: two strings to be compared (or QGrams/Counter objects)
    :param int qval: the length of each q-gram; 0 or None for non-q-gram
        version
    :param int or float pval: the :math:`p`-value of the :math:`L^p`-space
    :returns: normalized Minkowski similarity
    :rtype: float
    """
    return 1-minkowski(src, tar, qval, pval, True)
978
979
980
def manhattan(src, tar, qval=2, normalize=False):
    """Return the Manhattan distance between two strings.

    Manhattan distance is the Minkowski distance with :math:`p = 1`.

    :param str src, tar: two strings to be compared (or QGrams/Counter objects)
    :param int qval: the length of each q-gram; 0 or None for non-q-gram
        version
    :param bool normalize: if True, normalizes the result to [0, 1]
    :returns: the Manhattan distance
    :rtype: float
    """
    return minkowski(src, tar, qval, 1, normalize)
989
990
991
def dist_manhattan(src, tar, qval=2):
    """Return the Manhattan distance between two strings, normalized to [0, 1].

    NOTE(review): the original docstring claimed this is identical to
    Canberra distance -- that looks doubtful for count vectors; verify
    before relying on it.

    :param str src, tar: two strings to be compared (or QGrams/Counter objects)
    :param int qval: the length of each q-gram; 0 or None for non-q-gram
        version
    :returns: normalized Manhattan distance
    :rtype: float
    """
    # Bug fix: manhattan() takes (src, tar, qval, normalize); the extra
    # positional argument previously passed here raised a TypeError.
    return manhattan(src, tar, qval, True)
0 ignored issues
show
Bug introduced by
There seem to be too many positional arguments for this function call.
Loading history...
1002
1003
1004
def sim_manhattan(src, tar, qval=2):
    """Return the Manhattan similarity of two strings, normalized to [0, 1].

    The similarity is the complement of the normalized Manhattan distance.

    :param str src, tar: two strings to be compared (or QGrams/Counter objects)
    :param int qval: the length of each q-gram; 0 or None for non-q-gram
        version
    :returns: normalized Manhattan similarity
    :rtype: float
    """
    # Bug fix: manhattan() takes (src, tar, qval, normalize); the extra
    # positional argument previously passed here raised a TypeError.
    return 1 - manhattan(src, tar, qval, True)
0 ignored issues
show
Bug introduced by
There seem to be too many positional arguments for this function call.
Loading history...
1013
1014
1015
def euclidean(src, tar, qval=2, normalize=False):
    """Return the Euclidean distance between two strings.

    Euclidean distance is the Minkowski distance with :math:`p = 2`.

    :param str src, tar: two strings to be compared (or QGrams/Counter objects)
    :param int qval: the length of each q-gram; 0 or None for non-q-gram
        version
    :param bool normalize: if True, normalizes the result to [0, 1]
    :returns: the Euclidean distance
    :rtype: float
    """
    return minkowski(src, tar, qval, 2, normalize)
1024
1025
1026
def dist_euclidean(src, tar, qval=2):
    """Return the Euclidean distance between two strings, normalized to [0, 1].

    :param str src, tar: two strings to be compared (or QGrams/Counter objects)
    :param int qval: the length of each q-gram; 0 or None for non-q-gram
        version
    :returns: normalized Euclidean distance
    :rtype: float
    """
    return euclidean(src, tar, qval, True)
1035
1036
1037
def sim_euclidean(src, tar, qval=2):
    """Return the Euclidean similarity of two strings, normalized to [0, 1].

    The similarity is the complement of the normalized Euclidean distance.

    :param str src, tar: two strings to be compared (or QGrams/Counter objects)
    :param int qval: the length of each q-gram; 0 or None for non-q-gram
        version
    :returns: normalized Euclidean similarity
    :rtype: float
    """
    return 1-euclidean(src, tar, qval, True)
1046
1047
1048
def chebyshev(src, tar, qval=2, normalize=False):
    """Return the Chebyshev distance between two strings.

    Chebyshev distance is the Minkowski distance with
    :math:`p = \\infty` (the maximum per-q-gram count difference).

    :param str src, tar: two strings to be compared (or QGrams/Counter objects)
    :param int qval: the length of each q-gram; 0 or None for non-q-gram
        version
    :param bool normalize: if True, normalizes the result to [0, 1]
    :returns: the Chebyshev distance
    :rtype: float
    """
    return minkowski(src, tar, qval, float('inf'), normalize)
1057
1058
1059
def dist_chebyshev(src, tar, qval=2):
    """Return the Chebyshev distance between two strings, normalized to [0, 1].

    :param str src, tar: two strings to be compared (or QGrams/Counter objects)
    :param int qval: the length of each q-gram; 0 or None for non-q-gram
        version
    :returns: normalized Chebyshev distance
    :rtype: float
    """
    return chebyshev(src, tar, qval, True)
1068
1069
1070
def sim_chebyshev(src, tar, qval=2):
    """Return the Chebyshev similarity of two strings, normalized to [0, 1].

    The similarity is the complement of the normalized Chebyshev distance.

    :param str src, tar: two strings to be compared (or QGrams/Counter objects)
    :param int qval: the length of each q-gram; 0 or None for non-q-gram
        version
    :returns: normalized Chebyshev similarity
    :rtype: float
    """
    return 1 - chebyshev(src, tar, qval, True)
1079
1080
1081
def sim_cosine(src, tar, qval=2):
    r"""Return the cosine similarity of two strings.

    Cosine similarity (Ochiai coefficient)

    For two sets X and Y, the cosine similarity (Ochiai coefficient) is:
    :math:`sim_{cosine}(X, Y) = \\frac{|X \\cap Y|}{\\sqrt{|X| \\cdot |Y|}}`

    :param str src, tar: two strings to be compared (or QGrams/Counter objects)
    :param int qval: the length of each q-gram; 0 or None for non-q-gram
        version
    :returns: cosine similarity
    :rtype: float

    >>> sim_cosine('cat', 'hat')
    0.5
    >>> sim_cosine('Niall', 'Neil')
    0.3651483716701107
    >>> sim_cosine('aluminum', 'Catalan')
    0.11785113019775793
    >>> sim_cosine('ATCG', 'TAGC')
    0.0
    """
    # Identical strings are a perfect match; an empty string matches nothing.
    if src == tar:
        return 1.0
    if not src or not tar:
        return 0.0

    src_counts, tar_counts = _get_qgrams(src, tar, qval)
    overlap = sum((src_counts & tar_counts).values())
    magnitude_product = sum(src_counts.values()) * sum(tar_counts.values())

    return overlap / math.sqrt(magnitude_product)
1115
1116
1117
def dist_cosine(src, tar, qval=2):
    """Return the cosine distance between two strings.

    Cosine distance

    Cosine distance is the complement of cosine similarity:
    :math:`dist_{cosine} = 1 - sim_{cosine}`

    :param str src, tar: two strings to be compared (or QGrams/Counter objects)
    :param int qval: the length of each q-gram; 0 or None for non-q-gram
        version
    :returns: cosine distance, in the range [0, 1]
    :rtype: float

    >>> dist_cosine('cat', 'hat')
    0.5
    >>> dist_cosine('Niall', 'Neil')
    0.6348516283298893
    >>> dist_cosine('aluminum', 'Catalan')
    0.882148869802242
    >>> dist_cosine('ATCG', 'TAGC')
    1.0
    """
    return 1 - sim_cosine(src, tar, qval)
1141
1142
1143
def sim_strcmp95(src, tar, long_strings=False):
    """Return the strcmp95 similarity of two strings.

    strcmp95 similarity

    This is a Python translation of the C code for strcmp95:
    http://web.archive.org/web/20110629121242/http://www.census.gov/geo/msb/stand/strcmp.c
    The above file is a US Government publication and, accordingly,
    in the public domain.

    This is based on the Jaro-Winkler distance, but also attempts to correct
    for some common typos and frequently confused characters. It is also
    limited to uppercase ASCII characters, so it is appropriate to American
    names, but not much else.

    :param str src, tar: two strings to be compared
    :param bool long_strings: set to True to "Increase the probability of a
        match when the number of matched characters is large.  This option
        allows for a little more tolerance when the strings are large. It is
        not an appropriate test when comparing fixed length fields such as
        phone and social security numbers."
    :returns: strcmp95 similarity
    :rtype: float

    >>> sim_strcmp95('cat', 'hat')
    0.7777777777777777
    >>> sim_strcmp95('Niall', 'Neil')
    0.8454999999999999
    >>> sim_strcmp95('aluminum', 'Catalan')
    0.6547619047619048
    >>> sim_strcmp95('ATCG', 'TAGC')
    0.8333333333333334
    """
    def _inrange(char):
        """Return True if char is in the range (0, 91)."""
        # i.e. an ASCII character below 'Z' + 1 (covers digits and uppercase)
        return ord(char) > 0 and ord(char) < 91

    # The original C code's variable names (ying/yang) are preserved.
    ying = src.strip().upper()
    yang = tar.strip().upper()

    if ying == yang:
        return 1.0
    # If either string is blank - return - added in Version 2
    if not ying or not yang:
        return 0.0

    adjwt = defaultdict(int)
    # Pairs of characters that are commonly confused (phonetically or by
    # OCR); each pair earns partial credit when matched.
    sp_mx = (
        ('A', 'E'), ('A', 'I'), ('A', 'O'), ('A', 'U'), ('B', 'V'), ('E', 'I'),
        ('E', 'O'), ('E', 'U'), ('I', 'O'), ('I', 'U'), ('O', 'U'), ('I', 'Y'),
        ('E', 'Y'), ('C', 'G'), ('E', 'F'), ('W', 'U'), ('W', 'V'), ('X', 'K'),
        ('S', 'Z'), ('X', 'S'), ('Q', 'C'), ('U', 'V'), ('M', 'N'), ('L', 'I'),
        ('Q', 'O'), ('P', 'R'), ('I', 'J'), ('2', 'Z'), ('5', 'S'), ('8', 'B'),
        ('1', 'I'), ('1', 'L'), ('0', 'O'), ('0', 'Q'), ('C', 'K'), ('G', 'J')
    )

    # Initialize the adjwt array on the first call to the function only.
    # The adjwt array is used to give partial credit for characters that
    # may be errors due to known phonetic or character recognition errors.
    # A typical example is to match the letter "O" with the number "0"
    for i in sp_mx:
        adjwt[(i[0], i[1])] = 3
        adjwt[(i[1], i[0])] = 3

    if len(ying) > len(yang):
        search_range = len(ying)
        minv = len(yang)
    else:
        search_range = len(yang)
        minv = len(ying)

    # Blank out the flags
    ying_flag = [0] * search_range
    yang_flag = [0] * search_range
    # Matching window is half the longer string's length, minus one.
    search_range = max(0, search_range // 2 - 1)

    # Looking only within the search range, count and flag the matched pairs.
    num_com = 0
    yl1 = len(yang) - 1
    for i in range(len(ying)):
        lowlim = (i - search_range) if (i >= search_range) else 0
        hilim = (i + search_range) if ((i + search_range) <= yl1) else yl1
        for j in range(lowlim, hilim+1):
            if (yang_flag[j] == 0) and (yang[j] == ying[i]):
                yang_flag[j] = 1
                ying_flag[i] = 1
                num_com += 1
                break

    # If no characters in common - return
    if num_com == 0:
        return 0.0

    # Count the number of transpositions
    k = n_trans = 0
    for i in range(len(ying)):
        if ying_flag[i] != 0:
            for j in range(k, len(yang)):
                if yang_flag[j] != 0:
                    k = j + 1
                    break
            # NOTE(review): j may be stale here if the inner loop finds no
            # flagged character; this matches the original C code's behavior
            # and is preserved deliberately.
            if ying[i] != yang[j]:
                n_trans += 1
    n_trans = n_trans // 2

    # Adjust for similarities in unmatched characters
    n_simi = 0
    if minv > num_com:
        for i in range(len(ying)):
            if ying_flag[i] == 0 and _inrange(ying[i]):
                for j in range(len(yang)):
                    if yang_flag[j] == 0 and _inrange(yang[j]):
                        if (ying[i], yang[j]) in adjwt:
                            n_simi += adjwt[(ying[i], yang[j])]
                            yang_flag[j] = 2
                            break
    # Each similar (but unmatched) pair counts 3/10 of a full match.
    num_sim = n_simi/10.0 + num_com

    # Main weight computation
    weight = num_sim / len(ying) + num_sim / len(yang) + \
        (num_com - n_trans) / num_com
    weight = weight / 3.0

    # Continue to boost the weight if the strings are similar
    if weight > 0.7:

        # Adjust for having up to the first 4 characters in common
        j = 4 if (minv >= 4) else minv
        i = 0
        while (i < j) and (ying[i] == yang[i]) and (not ying[i].isdigit()):
            i += 1
        if i:
            weight += i * 0.1 * (1.0 - weight)

        # Optionally adjust for long strings.

        # After agreeing beginning chars, at least two more must agree and
        # the agreeing characters must be > .5 of remaining characters.
        if (((long_strings) and (minv > 4) and (num_com > i+1) and
             (2*num_com >= minv+i))):
            if not ying[0].isdigit():
                weight += (1.0-weight) * ((num_com-i-1) /
                                          (len(ying)+len(yang)-i*2+2))

    return weight
1288
1289
1290
def dist_strcmp95(src, tar, long_strings=False):
    """Return the strcmp95 distance between two strings.

    strcmp95 distance

    strcmp95 distance is 1 - strcmp95 similarity

    :param str src, tar: two strings to be compared
    :param bool long_strings: set to True to "Increase the probability of a
        match when the number of matched characters is large.  This option
        allows for a little more tolerance when the strings are large. It is
        not an appropriate test when comparing fixed length fields such as
        phone and social security numbers."
    :returns: strcmp95 distance, in the range [0, 1]
    :rtype: float

    >>> dist_strcmp95('cat', 'hat')
    0.22222222222222232
    >>> dist_strcmp95('Niall', 'Neil')
    0.15450000000000008
    >>> dist_strcmp95('aluminum', 'Catalan')
    0.34523809523809523
    >>> dist_strcmp95('ATCG', 'TAGC')
    0.16666666666666663
    """
    return 1 - sim_strcmp95(src, tar, long_strings)
1316
1317
1318
def sim_jaro_winkler(src, tar, qval=1, mode='winkler', long_strings=False,
                     boost_threshold=0.7, scaling_factor=0.1):
    """Return the Jaro or Jaro-Winkler similarity of two strings.

    Jaro(-Winkler) distance

    This is Python based on the C code for strcmp95:
    http://web.archive.org/web/20110629121242/http://www.census.gov/geo/msb/stand/strcmp.c
    The above file is a US Government publication and, accordingly,
    in the public domain.

    :param str src, tar: two strings to be compared
    :param int qval: the length of each q-gram (defaults to 1: character-wise
        matching)
    :param str mode: indicates which variant of this distance metric to
        compute:

            - 'winkler' -- computes the Jaro-Winkler distance (default) which
              increases the score for matches near the start of the word
            - 'jaro' -- computes the Jaro distance

    The following arguments apply only when mode is 'winkler':

    :param bool long_strings: set to True to "Increase the probability of a
        match when the number of matched characters is large.  This option
        allows for a little more tolerance when the strings are large.  It is
        not an appropriate test when comparing fixed length fields such as
        phone and social security numbers."
    :param float boost_threshold: a value between 0 and 1, below which the
        Winkler boost is not applied (defaults to 0.7)
    :param float scaling_factor: a value between 0 and 0.25, indicating by how
        much to boost scores for matching prefixes (defaults to 0.1)

    :returns: Jaro or Jaro-Winkler similarity
    :rtype: float
    :raises ValueError: if mode is 'winkler' and boost_threshold is outside
        [0, 1] or scaling_factor is outside [0, 0.25]

    >>> sim_jaro_winkler('cat', 'hat')
    0.7777777777777777
    >>> sim_jaro_winkler('Niall', 'Neil')
    0.8049999999999999
    >>> sim_jaro_winkler('aluminum', 'Catalan')
    0.6011904761904762
    >>> sim_jaro_winkler('ATCG', 'TAGC')
    0.8333333333333334

    >>> sim_jaro_winkler('cat', 'hat', mode='jaro')
    0.7777777777777777
    >>> sim_jaro_winkler('Niall', 'Neil', mode='jaro')
    0.7833333333333333
    >>> sim_jaro_winkler('aluminum', 'Catalan', mode='jaro')
    0.6011904761904762
    >>> sim_jaro_winkler('ATCG', 'TAGC', mode='jaro')
    0.8333333333333334
    """
    # Validate Winkler-only parameters up front.
    if mode == 'winkler':
        if boost_threshold > 1 or boost_threshold < 0:
            raise ValueError('Unsupported boost_threshold assignment; ' +
                             'boost_threshold must be between 0 and 1.')
        if scaling_factor > 0.25 or scaling_factor < 0:
            raise ValueError('Unsupported scaling_factor assignment; ' +
                             'scaling_factor must be between 0 and 0.25.')

    if src == tar:
        return 1.0

    # Work on q-gram lists; with the default qval=1 this is
    # character-by-character matching.
    src = QGrams(src.strip(), qval).ordered_list
    tar = QGrams(tar.strip(), qval).ordered_list

    lens = len(src)
    lent = len(tar)

    # If either string is blank - return - added in Version 2
    if lens == 0 or lent == 0:
        return 0.0

    if lens > lent:
        search_range = lens
        minv = lent
    else:
        search_range = lent
        minv = lens

    # Zero out the flags
    src_flag = [0] * search_range
    tar_flag = [0] * search_range
    # Matching window is half the longer length, minus one.
    search_range = max(0, search_range//2 - 1)

    # Looking only within the search range, count and flag the matched pairs.
    num_com = 0
    yl1 = lent - 1
    for i in range(lens):
        lowlim = (i - search_range) if (i >= search_range) else 0
        hilim = (i + search_range) if ((i + search_range) <= yl1) else yl1
        for j in range(lowlim, hilim+1):
            if (tar_flag[j] == 0) and (tar[j] == src[i]):
                tar_flag[j] = 1
                src_flag[i] = 1
                num_com += 1
                break

    # If no characters in common - return
    if num_com == 0:
        return 0.0

    # Count the number of transpositions
    k = n_trans = 0
    for i in range(lens):
        if src_flag[i] != 0:
            for j in range(k, lent):
                if tar_flag[j] != 0:
                    k = j + 1
                    break
            # NOTE(review): j may be stale if the inner loop finds no
            # flagged position; this mirrors the original C implementation
            # and is preserved deliberately.
            if src[i] != tar[j]:
                n_trans += 1
    n_trans = n_trans // 2

    # Main weight computation for Jaro distance
    weight = num_com / lens + num_com / lent + (num_com - n_trans) / num_com
    weight = weight / 3.0

    # Continue to boost the weight if the strings are similar
    # This is the Winkler portion of Jaro-Winkler distance
    if mode == 'winkler' and weight > boost_threshold:

        # Adjust for having up to the first 4 characters in common
        j = 4 if (minv >= 4) else minv
        i = 0
        while (i < j) and (src[i] == tar[i]):
            i += 1
        if i:
            weight += i * scaling_factor * (1.0 - weight)

        # Optionally adjust for long strings.

        # After agreeing beginning chars, at least two more must agree and
        # the agreeing characters must be > .5 of remaining characters.
        if (((long_strings) and (minv > 4) and (num_com > i+1) and
             (2*num_com >= minv+i))):
            weight += (1.0-weight) * ((num_com-i-1) / (lens+lent-i*2+2))

    return weight
1459
1460
1461
def dist_jaro_winkler(src, tar, qval=1, mode='winkler', long_strings=False,
                      boost_threshold=0.7, scaling_factor=0.1):
    """Return the Jaro or Jaro-Winkler distance between two strings.

    Jaro(-Winkler) distance

    Jaro-Winkler distance is 1 - the Jaro-Winkler similarity

    :param str src, tar: two strings to be compared
    :param int qval: the length of each q-gram (defaults to 1: character-wise
        matching)
    :param str mode: indicates which variant of this distance metric to
        compute:

            - 'winkler' -- computes the Jaro-Winkler distance (default) which
              increases the score for matches near the start of the word
            - 'jaro' -- computes the Jaro distance

    The following arguments apply only when mode is 'winkler':

    :param bool long_strings: set to True to "Increase the probability of a
        match when the number of matched characters is large.  This option
        allows for a little more tolerance when the strings are large.  It is
        not an appropriate test when comparing fixed length fields such as
        phone and social security numbers."
    :param float boost_threshold: a value between 0 and 1, below which the
        Winkler boost is not applied (defaults to 0.7)
    :param float scaling_factor: a value between 0 and 0.25, indicating by how
        much to boost scores for matching prefixes (defaults to 0.1)

    :returns: Jaro or Jaro-Winkler distance, in the range [0, 1]
    :rtype: float

    >>> dist_jaro_winkler('cat', 'hat')
    0.22222222222222232
    >>> dist_jaro_winkler('Niall', 'Neil')
    0.19500000000000006
    >>> dist_jaro_winkler('aluminum', 'Catalan')
    0.39880952380952384
    >>> dist_jaro_winkler('ATCG', 'TAGC')
    0.16666666666666663

    >>> dist_jaro_winkler('cat', 'hat', mode='jaro')
    0.22222222222222232
    >>> dist_jaro_winkler('Niall', 'Neil', mode='jaro')
    0.21666666666666667
    >>> dist_jaro_winkler('aluminum', 'Catalan', mode='jaro')
    0.39880952380952384
    >>> dist_jaro_winkler('ATCG', 'TAGC', mode='jaro')
    0.16666666666666663
    """
    return 1 - sim_jaro_winkler(src, tar, qval, mode, long_strings,
                                boost_threshold, scaling_factor)
1514
1515
1516
def lcsseq(src, tar):
    """Return the longest common subsequence of two strings.

    Longest common subsequence (LCSseq)

    Based on the dynamic programming algorithm from
    http://rosettacode.org/wiki/Longest_common_subsequence#Dynamic_Programming_6
    This is licensed GFDL 1.2

    Modifications include:
        conversion to a numpy array in place of a list of lists

    :param str src, tar: two strings to be compared
    :returns: the longest common subsequence
    :rtype: str

    >>> lcsseq('cat', 'hat')
    'at'
    >>> lcsseq('Niall', 'Neil')
    'Nil'
    >>> lcsseq('aluminum', 'Catalan')
    'aln'
    >>> lcsseq('ATCG', 'TAGC')
    'AC'
    """
    # Fix: np.int was a deprecated alias for the builtin int and was
    # removed in NumPy 1.24; use the builtin directly.
    lengths = np.zeros((len(src)+1, len(tar)+1), dtype=int)

    # row 0 and column 0 are initialized to 0 already
    for i, src_char in enumerate(src):
        for j, tar_char in enumerate(tar):
            if src_char == tar_char:
                lengths[i+1, j+1] = lengths[i, j] + 1
            else:
                lengths[i+1, j+1] = max(lengths[i+1, j], lengths[i, j+1])

    # read the subsequence out from the matrix, walking back from the end
    result = ''
    i, j = len(src), len(tar)
    while i != 0 and j != 0:
        if lengths[i, j] == lengths[i-1, j]:
            i -= 1
        elif lengths[i, j] == lengths[i, j-1]:
            j -= 1
        else:
            # characters matched here; prepend and move diagonally
            result = src[i-1] + result
            i -= 1
            j -= 1
    return result
1566
1567
1568
def sim_lcsseq(src, tar):
    r"""Return the longest common subsequence similarity of two strings.

    Longest common subsequence similarity (:math:`sim_{LCSseq}`)

    This employs the LCSseq function to derive a similarity metric:
    :math:`sim_{LCSseq}(s,t) = \\frac{|LCSseq(s,t)|}{max(|s|, |t|)}`

    :param str src, tar: two strings to be compared
    :returns: LCSseq similarity, in the range [0, 1]
    :rtype: float

    >>> sim_lcsseq('cat', 'hat')
    0.6666666666666666
    >>> sim_lcsseq('Niall', 'Neil')
    0.6
    >>> sim_lcsseq('aluminum', 'Catalan')
    0.375
    >>> sim_lcsseq('ATCG', 'TAGC')
    0.5
    """
    # Guard clauses (plain `if` instead of lint-flagged `elif`-after-return):
    # identical strings match perfectly; an empty string matches nothing.
    if src == tar:
        return 1.0
    if not src or not tar:
        return 0.0
    return len(lcsseq(src, tar)) / max(len(src), len(tar))
1594
1595
1596
def dist_lcsseq(src, tar):
    """Return the longest common subsequence distance between two strings.

    Longest common subsequence distance (:math:`dist_{LCSseq}`)

    This employs the LCSseq function to derive a distance metric:
    :math:`dist_{LCSseq}(s,t) = 1 - sim_{LCSseq}(s,t)`

    :param str src, tar: two strings to be compared
    :returns: LCSseq distance, in the range [0, 1]
    :rtype: float

    >>> dist_lcsseq('cat', 'hat')
    0.33333333333333337
    >>> dist_lcsseq('Niall', 'Neil')
    0.4
    >>> dist_lcsseq('aluminum', 'Catalan')
    0.625
    >>> dist_lcsseq('ATCG', 'TAGC')
    0.5
    """
    return 1 - sim_lcsseq(src, tar)
1618
1619
1620
def lcsstr(src, tar):
    """Return the longest common substring of two strings.

    Longest common substring (LCSstr)

    Based on the code from
    https://en.wikibooks.org/wiki/Algorithm_Implementation/Strings/Longest_common_substring#Python
    This is licensed Creative Commons: Attribution-ShareAlike 3.0

    Modifications include:

        - conversion to a numpy array in place of a list of lists
        - conversion to Python 2/3-safe range from xrange via six

    :param str src, tar: two strings to be compared
    :returns: the longest common substring
    :rtype: str

    >>> lcsstr('cat', 'hat')
    'at'
    >>> lcsstr('Niall', 'Neil')
    'N'
    >>> lcsstr('aluminum', 'Catalan')
    'al'
    >>> lcsstr('ATCG', 'TAGC')
    'A'
    """
    # np.int was deprecated in NumPy 1.20 and removed in 1.24; the builtin
    # int selects the same default integer dtype.
    lengths = np.zeros((len(src)+1, len(tar)+1), dtype=int)
    longest, i_longest = 0, 0
    for i in range(1, len(src)+1):
        for j in range(1, len(tar)+1):
            if src[i-1] == tar[j-1]:
                # extend the run of matches ending at (i, j)
                lengths[i, j] = lengths[i-1, j-1] + 1
                if lengths[i, j] > longest:
                    longest = lengths[i, j]
                    i_longest = i
            else:
                lengths[i, j] = 0
    return src[i_longest - longest:i_longest]
1661
1662
1663
def sim_lcsstr(src, tar):
    r"""Return the longest common substring similarity of two strings.

    Longest common substring similarity (:math:`sim_{LCSstr}`)

    This employs the LCS function to derive a similarity metric:
    :math:`sim_{LCSstr}(s,t) = \\frac{|LCSstr(s,t)|}{max(|s|, |t|)}`

    :param str src, tar: two strings to be compared
    :returns: LCSstr similarity
    :rtype: float

    >>> sim_lcsstr('cat', 'hat')
    0.6666666666666666
    >>> sim_lcsstr('Niall', 'Neil')
    0.2
    >>> sim_lcsstr('aluminum', 'Catalan')
    0.25
    >>> sim_lcsstr('ATCG', 'TAGC')
    0.25
    """
    # Guard clauses (flat, no elif-after-return).
    if src == tar:
        return 1.0
    if not src or not tar:
        return 0.0
    return len(lcsstr(src, tar)) / max(len(src), len(tar))
1689
1690
1691
def dist_lcsstr(src, tar):
    """Return the longest common substring distance between two strings.

    Longest common substring distance (:math:`dist_{LCSstr}`)

    This is the complement of LCSstr similarity:
    :math:`dist_{LCSstr}(s,t) = 1 - sim_{LCSstr}(s,t)`

    :param str src, tar: two strings to be compared
    :returns: LCSstr distance
    :rtype: float

    >>> dist_lcsstr('cat', 'hat')
    0.33333333333333337
    >>> dist_lcsstr('Niall', 'Neil')
    0.8
    >>> dist_lcsstr('aluminum', 'Catalan')
    0.75
    >>> dist_lcsstr('ATCG', 'TAGC')
    0.75
    """
    similarity = sim_lcsstr(src, tar)
    return 1 - similarity
1713
1714
1715
def sim_ratcliff_obershelp(src, tar):
    """Return the Ratcliff-Obershelp similarity of two strings.

    Ratcliff-Obershelp similarity

    This follows the Ratcliff-Obershelp algorithm to derive a similarity
    measure:

        1. Find the length of the longest common substring in src & tar.
        2. Recurse on the strings to the left & right of each this substring
           in src & tar. The base case is a 0 length common substring, in which
           case, return 0. Otherwise, return the sum of the current longest
           common substring and the left & right recursed sums.
        3. Multiply this length by 2 and divide by the sum of the lengths of
           src & tar.

    Cf.
    http://www.drdobbs.com/database/pattern-matching-the-gestalt-approach/184407970

    :param str src, tar: two strings to be compared
    :returns: Ratcliff-Obershelp similarity
    :rtype: float

    >>> sim_ratcliff_obershelp('cat', 'hat')
    0.66666666666666663
    >>> sim_ratcliff_obershelp('Niall', 'Neil')
    0.66666666666666663
    >>> sim_ratcliff_obershelp('aluminum', 'Catalan')
    0.40000000000000002
    >>> sim_ratcliff_obershelp('ATCG', 'TAGC')
    0.5
    """
    def _lcsstr_stl(src, tar):
        """Return start positions & length for Ratcliff-Obershelp.

        Return the start position in the source string, start position in
        the target string, and length of the longest common substring of
        strings src and tar.
        """
        # np.int was removed in NumPy 1.24; the builtin int selects the
        # same default integer dtype.
        lengths = np.zeros((len(src)+1, len(tar)+1), dtype=int)
        longest, src_longest, tar_longest = 0, 0, 0
        for i in range(1, len(src)+1):
            for j in range(1, len(tar)+1):
                if src[i-1] == tar[j-1]:
                    lengths[i, j] = lengths[i-1, j-1] + 1
                    if lengths[i, j] > longest:
                        longest = lengths[i, j]
                        src_longest = i
                        tar_longest = j
                else:
                    lengths[i, j] = 0
        return (src_longest-longest, tar_longest-longest, longest)

    def _sstr_matches(src, tar):
        """Return the sum of substring match lengths.

        This follows the Ratcliff-Obershelp algorithm:
             1. Find the length of the longest common substring in src & tar.
             2. Recurse on the strings to the left & right of each this
                 substring in src & tar.
             3. Base case is a 0 length common substring, in which case,
                 return 0.
             4. Return the sum.
        """
        src_start, tar_start, length = _lcsstr_stl(src, tar)
        if length == 0:
            return 0
        return (_sstr_matches(src[:src_start], tar[:tar_start]) +
                length +
                _sstr_matches(src[src_start+length:], tar[tar_start+length:]))

    # Guard clauses (flat, no elif-after-return).
    if src == tar:
        return 1.0
    if not src or not tar:
        return 0.0
    return 2*_sstr_matches(src, tar)/(len(src)+len(tar))
1793
1794
1795
def dist_ratcliff_obershelp(src, tar):
    """Return the Ratcliff-Obershelp distance between two strings.

    Ratcliff-Obershelp distance

    Ratcliff-Obershelp distance is the complement of Ratcliff-Obershelp
    similarity:
    :math:`dist_{Ratcliff-Obershelp} = 1 - sim_{Ratcliff-Obershelp}`

    :param str src, tar: two strings to be compared
    :returns: Ratcliff-Obershelp distance
    :rtype: float

    >>> dist_ratcliff_obershelp('cat', 'hat')
    0.33333333333333337
    >>> dist_ratcliff_obershelp('Niall', 'Neil')
    0.33333333333333337
    >>> dist_ratcliff_obershelp('aluminum', 'Catalan')
    0.59999999999999998
    >>> dist_ratcliff_obershelp('ATCG', 'TAGC')
    0.5
    """
    similarity = sim_ratcliff_obershelp(src, tar)
    return 1 - similarity
1818
1819
1820
def mra_compare(src, tar):
    """Return the MRA comparison rating of two strings.

    Western Airlines Surname Match Rating Algorithm comparison rating

    A description of the algorithm can be found on page 18 of
    https://archive.org/details/accessingindivid00moor

    :param str src, tar: two strings to be compared
    :returns: MRA comparison rating
    :rtype: int

    >>> mra_compare('cat', 'hat')
    5
    >>> mra_compare('Niall', 'Neil')
    6
    >>> mra_compare('aluminum', 'Catalan')
    0
    >>> mra_compare('ATCG', 'TAGC')
    5
    """
    # Trivial cases: identical strings rate a perfect 6; an empty string
    # rates 0 against anything.
    if src == tar:
        return 6
    if src == '' or tar == '':
        return 0

    # Compare the MRA codexes of the two names, not the raw names.
    src = list(mra(src))
    tar = list(mra(tar))

    # Codex lengths differing by more than 2 are considered non-comparable.
    if abs(len(src)-len(tar)) > 2:
        return 0

    # The minimum rating needed to report a match depends on the combined
    # codex length (per the published MRA table).
    length_sum = len(src) + len(tar)
    for bound, rating in ((5, 5), (8, 4), (12, 3)):
        if length_sum < bound:
            min_rating = rating
            break
    else:
        min_rating = 2

    # Two elimination passes: drop position-wise agreeing characters, then
    # reverse and repeat so both ends of the codexes are compared.
    for _ in range(2):
        minlen = min(len(src), len(tar))
        disagree_src = [s for s, t in zip(src, tar) if s != t]
        disagree_tar = [t for s, t in zip(src, tar) if s != t]
        src = list(reversed(disagree_src + src[minlen:]))
        tar = list(reversed(disagree_tar + tar[minlen:]))

    # The rating is 6 minus the longer residue; report it only if it meets
    # the minimum threshold.
    similarity = 6 - max(len(src), len(tar))
    return similarity if similarity >= min_rating else 0
1879
1880
1881
def sim_mra(src, tar):
    """Return the normalized MRA similarity of two strings.

    Normalized Match Rating Algorithm similarity

    This is the MRA rating rescaled to :math:`[0, 1]`, given that MRA itself
    is constrained to the range :math:`[0, 6]`.

    :param str src, tar: two strings to be compared
    :returns: normalized MRA similarity
    :rtype: float

    >>> sim_mra('cat', 'hat')
    0.8333333333333334
    >>> sim_mra('Niall', 'Neil')
    1.0
    >>> sim_mra('aluminum', 'Catalan')
    0.0
    >>> sim_mra('ATCG', 'TAGC')
    0.8333333333333334
    """
    rating = mra_compare(src, tar)
    return rating/6
1903
1904
1905
def dist_mra(src, tar):
    """Return the normalized MRA distance between two strings.

    Normalized Match Rating Algorithm distance

    MRA distance is the complement of MRA similarity:
    :math:`dist_{MRA} = 1 - sim_{MRA}`

    :param str src, tar: two strings to be compared
    :returns: normalized MRA distance
    :rtype: float

    >>> dist_mra('cat', 'hat')
    0.16666666666666663
    >>> dist_mra('Niall', 'Neil')
    0.0
    >>> dist_mra('aluminum', 'Catalan')
    1.0
    >>> dist_mra('ATCG', 'TAGC')
    0.16666666666666663
    """
    similarity = sim_mra(src, tar)
    return 1 - similarity
1927
1928
1929
def dist_compression(src, tar, compressor='bz2', probs=None):
    """Return the normalized compression distance between two strings.

    Normalized compression distance (NCD)

    Cf.
    https://en.wikipedia.org/wiki/Normalized_compression_distance#Normalized_compression_distance

    :param str src, tar: two strings to be compared
    :param str compressor: a compression scheme to use for the similarity
        calculation, from the following:

            - `zlib` -- standard zlib/gzip
            - `bz2` -- bzip2 (default)
            - `lzma` -- Lempel–Ziv–Markov chain algorithm
            - `arith` -- arithmetic coding
            - `rle` -- run-length encoding
            - `bwtrle` -- Burrows-Wheeler transform followed by run-length
              encoding

    :param dict probs: a dictionary trained with ac_train (for the arith
        compressor only)
    :returns: compression distance
    :rtype: float

    >>> dist_compression('cat', 'hat')
    0.08
    >>> dist_compression('Niall', 'Neil')
    0.037037037037037035
    >>> dist_compression('aluminum', 'Catalan')
    0.20689655172413793
    >>> dist_compression('ATCG', 'TAGC')
    0.037037037037037035

    >>> dist_compression('Niall', 'Neil', compressor='zlib')
    0.45454545454545453
    >>> dist_compression('Niall', 'Neil', compressor='bz2')
    0.037037037037037035
    >>> dist_compression('Niall', 'Neil', compressor='lzma')
    0.16
    >>> dist_compression('Niall', 'Neil', compressor='arith')
    0.6875
    >>> dist_compression('Niall', 'Neil', compressor='rle')
    1.0
    >>> dist_compression('Niall', 'Neil', compressor='bwtrle')
    0.8333333333333334
    """
    if src == tar:
        return 0.0

    # The byte-oriented compressors work on UTF-8 bytes; arith, rle, and
    # bwtrle operate on str directly.
    if compressor not in {'arith', 'rle', 'bwtrle'}:
        src = src.encode('utf-8')
        tar = tar.encode('utf-8')

    if compressor == 'arith':
        if probs is None:
            # lacking a reasonable dictionary, train on the strings themselves
            probs = ac_train(src+tar)
        # ac_encode returns (code, length); the NCD here is computed over
        # the returned bit lengths rather than over compressed byte strings.
        src_comp = ac_encode(src, probs)[1]
        tar_comp = ac_encode(tar, probs)[1]
        concat_comp = ac_encode(src+tar, probs)[1]
        concat_comp2 = ac_encode(tar+src, probs)[1]
        return ((min(concat_comp, concat_comp2) - min(src_comp, tar_comp)) /
                max(src_comp, tar_comp))

    if compressor in {'rle', 'bwtrle'}:
        use_bwt = (compressor == 'bwtrle')
        src_comp = rle_encode(src, use_bwt)
        tar_comp = rle_encode(tar, use_bwt)
        concat_comp = rle_encode(src+tar, use_bwt)
        concat_comp2 = rle_encode(tar+src, use_bwt)
    elif compressor == 'lzma':
        if 'lzma' not in sys.modules:  # pragma: no cover
            raise ValueError('Install the PylibLZMA module in order to use ' +
                             'lzma compression similarity')
        # strip the 14-byte LZMA header so only the payload is measured
        src_comp = lzma.compress(src)[14:]
        tar_comp = lzma.compress(tar)[14:]
        concat_comp = lzma.compress(src+tar)[14:]
        concat_comp2 = lzma.compress(tar+src)[14:]
    elif compressor == 'bz2':
        # strip the 15-byte bzip2 header so only the payload is measured
        src_comp = codecs.encode(src, 'bz2_codec')[15:]
        tar_comp = codecs.encode(tar, 'bz2_codec')[15:]
        concat_comp = codecs.encode(src+tar, 'bz2_codec')[15:]
        concat_comp2 = codecs.encode(tar+src, 'bz2_codec')[15:]
    else:  # zlib
        # strip the 2-byte zlib header so only the payload is measured
        src_comp = codecs.encode(src, 'zlib_codec')[2:]
        tar_comp = codecs.encode(tar, 'zlib_codec')[2:]
        concat_comp = codecs.encode(src+tar, 'zlib_codec')[2:]
        concat_comp2 = codecs.encode(tar+src, 'zlib_codec')[2:]

    # Standard NCD over compressed byte lengths; both concatenation orders
    # are tried and the smaller taken.
    return ((min(len(concat_comp), len(concat_comp2)) -
             min(len(src_comp), len(tar_comp))) /
            max(len(src_comp), len(tar_comp)))
2020
2021
2022
def sim_compression(src, tar, compressor='bz2', probs=None):
    """Return the normalized compression similarity of two strings.

    Normalized compression similarity (NCS)

    Normalized compression similarity is the complement of normalized
    compression distance:
    :math:`sim_{NCS} = 1 - dist_{NCD}`

    :param str src, tar: two strings to be compared
    :param str compressor: a compression scheme to use for the similarity
        calculation:

            - `zlib` -- standard zlib/gzip
            - `bz2` -- bzip2 (default)
            - `lzma` -- Lempel–Ziv–Markov chain algorithm
            - `arith` -- arithmetic coding
            - `rle` -- run-length encoding
            - `bwtrle` -- Burrows-Wheeler transform followed by run-length
              encoding

    :param dict probs: a dictionary trained with ac_train (for the arith
        compressor only)
    :returns: compression similarity
    :rtype: float

    >>> sim_compression('cat', 'hat')
    0.92
    >>> sim_compression('Niall', 'Neil')
    0.962962962962963
    >>> sim_compression('aluminum', 'Catalan')
    0.7931034482758621
    >>> sim_compression('ATCG', 'TAGC')
    0.962962962962963

    >>> sim_compression('Niall', 'Neil', compressor='zlib')
    0.5454545454545454
    >>> sim_compression('Niall', 'Neil', compressor='bz2')
    0.962962962962963
    >>> sim_compression('Niall', 'Neil', compressor='lzma')
    0.84
    >>> sim_compression('Niall', 'Neil', compressor='arith')
    0.3125
    >>> sim_compression('Niall', 'Neil', compressor='rle')
    0.0
    >>> sim_compression('Niall', 'Neil', compressor='bwtrle')
    0.16666666666666663
    """
    ncd = dist_compression(src, tar, compressor, probs)
    return 1 - ncd
2071
2072
2073
def sim_monge_elkan(src, tar, sim_func=sim_levenshtein, symmetric=False):
    """Return the Monge-Elkan similarity of two strings.

    Monge-Elkan similarity

    Monge-Elkan is defined in:
    Monge, Alvaro E. and Charles P. Elkan. 1996. "The field matching problem:
    Algorithms and applications." KDD-9 Proceedings.
    http://www.aaai.org/Papers/KDD/1996/KDD96-044.pdf

    Note: Monge-Elkan is NOT a symmetric similarity algorithm. Thus, the
    similarity of src to tar is not necessarily equal to the similarity of
    tar to src. If the symmetric argument is True, a symmetric value is
    calculated, at the cost of doubling the computation time (since the
    :math:`sim_{Monge-Elkan}(src, tar)` and
    :math:`sim_{Monge-Elkan}(tar, src)` are both calculated and then averaged).

    :param str src, tar: two strings to be compared
    :param function sim_func: the internal similarity metric to employ
    :param bool symmetric: return a symmetric similarity measure
    :returns: Monge-Elkan similarity
    :rtype: float

    >>> sim_monge_elkan('cat', 'hat')
    0.75
    >>> sim_monge_elkan('Niall', 'Neil')
    0.66666666666666663
    >>> sim_monge_elkan('aluminum', 'Catalan')
    0.3888888888888889
    >>> sim_monge_elkan('ATCG', 'TAGC')
    0.5
    """
    if src == tar:
        return 1.0

    q_src = sorted(QGrams(src).elements())
    q_tar = sorted(QGrams(tar).elements())

    if not q_src or not q_tar:
        return 0.0

    # For each source q-gram, take the best similarity against any target
    # q-gram (q_tar is guaranteed non-empty here, so max() cannot fail).
    sum_of_maxes = 0
    for q_s in q_src:
        sum_of_maxes += max(sim_func(q_s, q_t) for q_t in q_tar)
    sim_em = sum_of_maxes / len(q_src)

    if symmetric:
        # BUG FIX: the reverse-direction call previously passed the name
        # `sim` instead of the sim_func argument, so a caller-supplied
        # sim_func was ignored (or a NameError raised) in symmetric mode.
        sim_em = (sim_em + sim_monge_elkan(tar, src, sim_func, False))/2

    return sim_em
2126
2127
2128
def dist_monge_elkan(src, tar, sim_func=sim_levenshtein, symmetric=False):
    """Return the Monge-Elkan distance between two strings.

    Monge-Elkan distance

    Monge-Elkan is defined in:
    Monge, Alvaro E. and Charles P. Elkan. 1996. "The field matching problem:
    Algorithms and applications." KDD-9 Proceedings.
    http://www.aaai.org/Papers/KDD/1996/KDD96-044.pdf

    Note: Monge-Elkan is NOT a symmetric similarity algorithm. Thus, the
    distance between src and tar is not necessarily equal to the distance
    between tar and src. If the symmetric argument is True, a symmetric value
    is calculated, at the cost of doubling the computation time (since the
    :math:`sim_{Monge-Elkan}(src, tar)` and :math:`sim_{Monge-Elkan}(tar, src)`
    are both calculated and then averaged).

    :param str src, tar: two strings to be compared
    :param function sim_func: the internal similarity metric to employ
    :param bool symmetric: return a symmetric similarity measure
    :returns: Monge-Elkan distance
    :rtype: float

    >>> dist_monge_elkan('cat', 'hat')
    0.25
    >>> dist_monge_elkan('Niall', 'Neil')
    0.33333333333333337
    >>> dist_monge_elkan('aluminum', 'Catalan')
    0.61111111111111116
    >>> dist_monge_elkan('ATCG', 'TAGC')
    0.5
    """
    similarity = sim_monge_elkan(src, tar, sim_func, symmetric)
    return 1 - similarity
2161
2162
2163
def sim_ident(src, tar):
    """Return the identity similarity of two strings.

    Identity similarity

    This is 1 if the two strings are identical, otherwise 0.

    :param str src, tar: two strings to be compared
    :returns: identity similarity
    :rtype: int

    >>> sim_ident('cat', 'hat')
    0
    >>> sim_ident('cat', 'cat')
    1
    """
    return 1 if src == tar else 0
2180
2181
2182
def dist_ident(src, tar):
    """Return the identity distance between two strings.

    Identity distance

    This is 0 if the two strings are identical, otherwise 1, i.e.
    :math:`dist_{identity} = 1 - sim_{identity}`

    :param str src, tar: two strings to be compared
    :returns: identity distance
    :rtype: int

    >>> dist_ident('cat', 'hat')
    1
    >>> dist_ident('cat', 'cat')
    0
    """
    similarity = sim_ident(src, tar)
    return 1 - similarity
2200
2201
2202
def sim_matrix(src, tar, mat=None, mismatch_cost=0, match_cost=1,
               symmetric=True, alphabet=None):
    """Return the matrix similarity of two strings.

    Matrix similarity

    With the default parameters, this is identical to sim_ident.
    It is possible for sim_matrix to return values outside of the range
    :math:`[0, 1]`, if values outside that range are present in mat,
    mismatch_cost, or match_cost.

    :param str src, tar: two strings to be compared
    :param dict mat: a dict mapping tuples to costs; the tuples are (src, tar)
        pairs of symbols from the alphabet parameter
    :param float mismatch_cost: the value returned if (src, tar) is absent from
        mat when src does not equal tar
    :param float match_cost: the value returned if (src, tar) is absent from
        mat when src equals tar
    :param bool symmetric: True if the cost of src not matching tar is
        identical to the cost of tar not matching src; in this case, the values
        in mat need only contain (src, tar) or (tar, src), not both
    :param str alphabet: a collection of tokens from which src and tar are
        drawn; if this is defined a ValueError is raised if either tar or src
        is not found in alphabet
    :raises ValueError: if alphabet is given and src or tar contains a symbol
        outside it
    :returns: matrix similarity
    :rtype: float

    >>> sim_matrix('cat', 'hat')
    0
    >>> sim_matrix('hat', 'hat')
    1
    """
    if alphabet:
        alphabet = tuple(alphabet)
        for i in src:
            if i not in alphabet:
                raise ValueError('src value not in alphabet')
        for i in tar:
            if i not in alphabet:
                raise ValueError('tar value not in alphabet')

    if src == tar:
        # an explicit matrix entry for a match overrides match_cost
        if mat and (src, src) in mat:
            return mat[(src, src)]
        return match_cost
    # Flat guard clauses (no elif-after-return): explicit entry, then the
    # symmetric mirror entry, then the default mismatch cost.
    if mat and (src, tar) in mat:
        return mat[(src, tar)]
    if symmetric and mat and (tar, src) in mat:
        return mat[(tar, src)]
    return mismatch_cost
2252
2253
2254 View Code Duplication
def needleman_wunsch(src, tar, gap_cost=1, sim_func=sim_ident):
    """Return the Needleman-Wunsch score of two strings.

    Needleman-Wunsch score

    This is the standard edit distance measure.

    Cf. https://en.wikipedia.org/wiki/Needleman–Wunsch_algorithm

    Cf.
    http://csb.stanford.edu/class/public/readings/Bioinformatics_I_Lecture6/Needleman_Wunsch_JMB_70_Global_alignment.pdf

    :param str src, tar: two strings to be compared
    :param float gap_cost: the cost of an alignment gap (1 by default)
    :param function sim_func: a function that returns the similarity of two
        characters (identity similarity by default)
    :returns: Needleman-Wunsch score
    :rtype: int (in fact dependent on the gap_cost & return value of sim_func)

    >>> needleman_wunsch('cat', 'hat')
    2.0
    >>> needleman_wunsch('Niall', 'Neil')
    1.0
    >>> needleman_wunsch('aluminum', 'Catalan')
    -1.0
    >>> needleman_wunsch('ATCG', 'TAGC')
    0.0
    """
    # np.float was removed in NumPy 1.24; the builtin float selects the
    # same float64 dtype.
    d_mat = np.zeros((len(src)+1, len(tar)+1), dtype=float)

    # first row/column: cumulative gap penalties for aligning against ''
    for i in range(len(src)+1):
        d_mat[i, 0] = -(i * gap_cost)
    for j in range(len(tar)+1):
        d_mat[0, j] = -(j * gap_cost)
    for i in range(1, len(src)+1):
        for j in range(1, len(tar)+1):
            match = d_mat[i-1, j-1] + sim_func(src[i-1], tar[j-1])
            delete = d_mat[i-1, j] - gap_cost
            insert = d_mat[i, j-1] - gap_cost
            d_mat[i, j] = max(match, delete, insert)
    # the global alignment score sits in the bottom-right cell
    return d_mat[-1, -1]
2297
2298
2299 View Code Duplication
def smith_waterman(src, tar, gap_cost=1, sim_func=sim_ident):
    """Return the Smith-Waterman score of two strings.

    Smith-Waterman score

    This is the standard edit distance measure.

    Cf. https://en.wikipedia.org/wiki/Smith–Waterman_algorithm

    :param str src, tar: two strings to be compared
    :param float gap_cost: the cost of an alignment gap (1 by default)
    :param function sim_func: a function that returns the similarity of two
        characters (identity similarity by default)
    :returns: Smith-Waterman score
    :rtype: int (in fact dependent on the gap_cost & return value of sim_func)

    >>> smith_waterman('cat', 'hat')
    2.0
    >>> smith_waterman('Niall', 'Neil')
    1.0
    >>> smith_waterman('aluminum', 'Catalan')
    0.0
    >>> smith_waterman('ATCG', 'TAGC')
    1.0
    """
    # np.float was removed in NumPy 1.24; the builtin float selects the
    # same float64 dtype. np.zeros already initializes the first row and
    # column to 0 (local alignment), so no explicit zeroing loops are needed.
    d_mat = np.zeros((len(src)+1, len(tar)+1), dtype=float)

    for i in range(1, len(src)+1):
        for j in range(1, len(tar)+1):
            match = d_mat[i-1, j-1] + sim_func(src[i-1], tar[j-1])
            delete = d_mat[i-1, j] - gap_cost
            insert = d_mat[i, j-1] - gap_cost
            # local alignment: scores are clamped at 0
            d_mat[i, j] = max(0, match, delete, insert)
    return d_mat[-1, -1]
2339
2340
2341
def gotoh(src, tar, gap_open=1, gap_ext=0.4, sim_func=sim_ident):
    """Return the Gotoh score of two strings.

    Gotoh score

    Gotoh's algorithm is essentially Needleman-Wunsch with affine gap
    penalties:
    https://www.cs.umd.edu/class/spring2003/cmsc838t/papers/gotoh1982.pdf

    :param str src, tar: two strings to be compared
    :param float gap_open: the cost of an open alignment gap (1 by default)
    :param float gap_ext: the cost of an alignment gap extension (0.4 by
        default)
    :param function sim_func: a function that returns the similarity of two
        characters (identity similarity by default)
    :returns: Gotoh score
    :rtype: float (in fact dependent on the gap_cost & return value of
        sim_func)

    >>> gotoh('cat', 'hat')
    2.0
    >>> gotoh('Niall', 'Neil')
    1.0
    >>> gotoh('aluminum', 'Catalan')
    -0.40000000000000002
    >>> gotoh('cat', 'hat')
    2.0
    """
    # np.float was removed in NumPy 1.24; the builtin float selects the
    # same float64 dtype.
    # d_mat: best score ending in a match/mismatch
    # p_mat: best score ending in a gap in tar (deletion from src)
    # q_mat: best score ending in a gap in src (insertion into src)
    d_mat = np.zeros((len(src)+1, len(tar)+1), dtype=float)
    p_mat = np.zeros((len(src)+1, len(tar)+1), dtype=float)
    q_mat = np.zeros((len(src)+1, len(tar)+1), dtype=float)

    # boundary conditions: -inf marks impossible states; affine penalties
    # (gap_open + gap_ext per extension) fill the first row/column
    d_mat[0, 0] = 0
    p_mat[0, 0] = float('-inf')
    q_mat[0, 0] = float('-inf')
    for i in range(1, len(src)+1):
        d_mat[i, 0] = float('-inf')
        p_mat[i, 0] = -gap_open - gap_ext*(i-1)
        q_mat[i, 0] = float('-inf')
        q_mat[i, 1] = -gap_open
    for j in range(1, len(tar)+1):
        d_mat[0, j] = float('-inf')
        p_mat[0, j] = float('-inf')
        p_mat[1, j] = -gap_open
        q_mat[0, j] = -gap_open - gap_ext*(j-1)

    for i in range(1, len(src)+1):
        for j in range(1, len(tar)+1):
            sim_val = sim_func(src[i-1], tar[j-1])
            d_mat[i, j] = max(d_mat[i-1, j-1] + sim_val,
                              p_mat[i-1, j-1] + sim_val,
                              q_mat[i-1, j-1] + sim_val)

            p_mat[i, j] = max(d_mat[i-1, j] - gap_open,
                              p_mat[i-1, j] - gap_ext)

            q_mat[i, j] = max(d_mat[i, j-1] - gap_open,
                              q_mat[i, j-1] - gap_ext)

    # the final cell indices, written explicitly (clearer than unpacking a
    # generator over d_mat.shape, which also confused static analysis)
    i, j = len(src), len(tar)
    return max(d_mat[i, j], p_mat[i, j], q_mat[i, j])
2404
2405
2406
def sim_length(src, tar):
    """Return the length similarity of two strings.

    Length similarity

    This is the ratio of the length of the shorter string to the longer.

    :param str src, tar: two strings to be compared
    :returns: length similarity
    :rtype: float

    >>> sim_length('cat', 'hat')
    1.0
    >>> sim_length('Niall', 'Neil')
    0.8
    >>> sim_length('aluminum', 'Catalan')
    0.875
    >>> sim_length('ATCG', 'TAGC')
    1.0
    """
    if src == tar:
        return 1.0
    if not src or not tar:
        return 0.0
    src_len, tar_len = len(src), len(tar)
    return min(src_len, tar_len) / max(src_len, tar_len)
2431
2432
2433
def dist_length(src, tar):
    """Return the length distance between two strings.

    Length distance

    Length distance is the complement of length similarity:
    :math:`dist_{length} = 1 - sim_{length}`

    :param str src, tar: two strings to be compared
    :returns: length distance
    :rtype: float

    >>> dist_length('cat', 'hat')
    0.0
    >>> dist_length('Niall', 'Neil')
    0.19999999999999996
    >>> dist_length('aluminum', 'Catalan')
    0.125
    >>> dist_length('ATCG', 'TAGC')
    0.0
    """
    similarity = sim_length(src, tar)
    return 1 - similarity
2455
2456
2457 View Code Duplication
def sim_prefix(src, tar):
    """Return the prefix similarity of two strings.

    Prefix similarity is the ratio of the length of the shorter term that
    exactly matches the longer term to the length of the shorter term,
    beginning at the start of both terms.

    :param str src, tar: two strings to be compared
    :returns: prefix similarity
    :rtype: float

    >>> sim_prefix('cat', 'hat')
    0.0
    >>> sim_prefix('Niall', 'Neil')
    0.25
    >>> sim_prefix('aluminum', 'Catalan')
    0.0
    >>> sim_prefix('ATCG', 'TAGC')
    0.0
    """
    if src == tar:
        return 1.0
    if not src or not tar:
        return 0.0
    shorter, longer = sorted((src, tar), key=len)
    total = len(shorter)
    # Find the longest shared prefix, scanning from longest candidate down.
    for length in range(total, 0, -1):
        if longer.startswith(shorter[:length]):
            return length/total
    return 0.0
2489
2490
2491
def dist_prefix(src, tar):
    """Return the prefix distance between two strings.

    Prefix distance is the complement of prefix similarity:
    :math:`dist_{prefix} = 1 - sim_{prefix}`

    :param str src, tar: two strings to be compared
    :returns: prefix distance
    :rtype: float

    >>> dist_prefix('cat', 'hat')
    1.0
    >>> dist_prefix('Niall', 'Neil')
    0.75
    >>> dist_prefix('aluminum', 'Catalan')
    1.0
    >>> dist_prefix('ATCG', 'TAGC')
    1.0
    """
    similarity = sim_prefix(src, tar)
    return 1 - similarity
2513
2514
2515 View Code Duplication
def sim_suffix(src, tar):
    """Return the suffix similarity of two strings.

    Suffix similarity is the ratio of the length of the shorter term that
    exactly matches the longer term to the length of the shorter term,
    beginning at the end of both terms.

    :param str src, tar: two strings to be compared
    :returns: suffix similarity
    :rtype: float

    >>> sim_suffix('cat', 'hat')
    0.6666666666666666
    >>> sim_suffix('Niall', 'Neil')
    0.25
    >>> sim_suffix('aluminum', 'Catalan')
    0.0
    >>> sim_suffix('ATCG', 'TAGC')
    0.0
    """
    if src == tar:
        return 1.0
    if not src or not tar:
        return 0.0
    shorter, longer = sorted((src, tar), key=len)
    total = len(shorter)
    # Find the longest shared suffix, scanning from longest candidate down.
    for length in range(total, 0, -1):
        if longer.endswith(shorter[-length:]):
            return length/total
    return 0.0
2547
2548
2549
def dist_suffix(src, tar):
    """Return the suffix distance between two strings.

    Suffix distance is the complement of suffix similarity:
    :math:`dist_{suffix} = 1 - sim_{suffix}`

    :param str src, tar: two strings to be compared
    :returns: suffix distance
    :rtype: float

    >>> dist_suffix('cat', 'hat')
    0.33333333333333337
    >>> dist_suffix('Niall', 'Neil')
    0.75
    >>> dist_suffix('aluminum', 'Catalan')
    1.0
    >>> dist_suffix('ATCG', 'TAGC')
    1.0
    """
    similarity = sim_suffix(src, tar)
    return 1 - similarity
2571
2572
2573
def sim_mlipns(src, tar, threshold=0.25, maxmismatches=2):
    """Return the MLIPNS similarity of two strings.

    Modified Language-Independent Product Name Search (MLIPNS)

    The MLIPNS algorithm is described in Shannaq, Boumedyen A. N. and Victor V.
    Alexandrov. 2010. "Using Product Similarity for Adding Business." Global
    Journal of Computer Science and Technology. 10(12). 2-8.
    http://www.sial.iias.spb.su/files/386-386-1-PB.pdf

    This function returns only 1.0 (similar) or 0.0 (not similar).

    LIPNS similarity is identical to normalized Hamming similarity.

    :param str src, tar: two strings to be compared
    :param float threshold: a number [0, 1] indicating the maximum similarity
        score, below which the strings are considered 'similar' (0.25 by
        default)
    :param int maxmismatches: a number indicating the allowable number of
        mismatches to remove before declaring two strings not similar (2 by
        default)
    :returns: MLIPNS similarity
    :rtype: float

    >>> sim_mlipns('cat', 'hat')
    1.0
    >>> sim_mlipns('Niall', 'Neil')
    0.0
    >>> sim_mlipns('aluminum', 'Catalan')
    0.0
    >>> sim_mlipns('ATCG', 'TAGC')
    0.0
    """
    if tar == src:
        return 1.0
    if not src or not tar:
        return 0.0

    mismatches = 0
    # Hamming distance over the full (padded) length of the two strings
    ham = hamming(src, tar, difflens=True)
    maxlen = max(len(src), len(tar))
    while src and tar and mismatches <= maxmismatches:
        # Similar if the mismatch ratio has dropped to the threshold;
        # otherwise strip one mismatch and retry.
        # (No "else" needed after the return -- flagged by lint.)
        if maxlen < 1 or (1-(maxlen-ham)/maxlen) <= threshold:
            return 1.0
        mismatches += 1
        ham -= 1
        maxlen -= 1

    if maxlen < 1:
        return 1.0
    return 0.0
2625
2626
2627
def dist_mlipns(src, tar, threshold=0.25, maxmismatches=2):
    """Return the MLIPNS distance between two strings.

    Modified Language-Independent Product Name Search (MLIPNS)

    MLIPNS distance is the complement of MLIPNS similarity:
    :math:`dist_{MLIPNS} = 1 - sim_{MLIPNS}`

    This function returns only 0.0 (distant) or 1.0 (not distant)

    :param str src, tar: two strings to be compared
    :param float threshold: a number [0, 1] indicating the maximum similarity
        score, below which the strings are considered 'similar' (0.25 by
        default)
    :param int maxmismatches: a number indicating the allowable number of
        mismatches to remove before declaring two strings not similar (2 by
        default)
    :returns: MLIPNS distance
    :rtype: float

    >>> dist_mlipns('cat', 'hat')
    0.0
    >>> dist_mlipns('Niall', 'Neil')
    1.0
    >>> dist_mlipns('aluminum', 'Catalan')
    1.0
    >>> dist_mlipns('ATCG', 'TAGC')
    1.0
    """
    similarity = sim_mlipns(src, tar, threshold, maxmismatches)
    return 1.0 - similarity
2657
2658
2659
def bag(src, tar):
    """Return the bag distance between two strings.

    Bag distance

    Bag distance is proposed in Bartolini, Illaria, Paolo Ciaccia, and Marco
    Patella. 2002. "String Matching with Metric Trees Using and Approximate
    Distance. Proceedings of the 9th International Symposium on String
    Processing and Information Retrieval, Lisbone, Portugal, September 2002.
    271-283.
    http://www-db.disi.unibo.it/research/papers/SPIRE02.pdf

    It is defined as:
    :math:`max( |multiset(src)-multiset(tar)|, |multiset(tar)-multiset(src)| )`

    :param str src, tar: two strings to be compared
    :returns: bag distance
    :rtype: int

    >>> bag('cat', 'hat')
    1
    >>> bag('Niall', 'Neil')
    2
    >>> bag('aluminum', 'Catalan')
    5
    >>> bag('ATCG', 'TAGC')
    0
    >>> bag('abcdefg', 'hijklm')
    7
    >>> bag('abcdefg', 'hijklmno')
    8
    """
    # Guard clauses (flat, rather than elif chains after a return)
    if tar == src:
        return 0
    if not src:
        return len(tar)
    if not tar:
        return len(src)

    # Counter subtraction keeps only positive multiplicities, giving the
    # multiset differences in each direction.
    src_bag = Counter(src)
    tar_bag = Counter(tar)
    return max(sum((src_bag-tar_bag).values()),
               sum((tar_bag-src_bag).values()))
2702
2703
2704
def dist_bag(src, tar):
2705
    """Return the normalized bag distance between two strings.
2706
2707
    Normalized bag distance
2708
2709
    Bag distance is normalized by dividing by :math:`max( |src|, |tar| )`.
2710
2711
    :param str src, tar: two strings to be compared
2712
    :returns: normalized bag distance
2713
    :rtype: float
2714
2715
    >>> dist_bag('cat', 'hat')
2716
    0.3333333333333333
2717
    >>> dist_bag('Niall', 'Neil')
2718
    0.4
2719
    >>> dist_bag('aluminum', 'Catalan')
2720
    0.375
2721
    >>> dist_bag('ATCG', 'TAGC')
2722
    0.0
2723
    """
2724
    if tar == src:
2725
        return 0.0
2726
    if not src or not tar:
2727
        return 1.0
2728
2729
    maxlen = max(len(src), len(tar))
2730
2731
    return bag(src, tar)/maxlen
2732
2733
2734
def sim_bag(src, tar):
    """Return the normalized bag similarity of two strings.

    Normalized bag similarity

    Normalized bag similarity is the complement of normalized bag distance:
    :math:`sim_{bag} = 1 - dist_{bag}`

    :param str src, tar: two strings to be compared
    :returns: normalized bag similarity
    :rtype: float

    >>> sim_bag('cat', 'hat')
    0.6666666666666667
    >>> sim_bag('Niall', 'Neil')
    0.6
    >>> sim_bag('aluminum', 'Catalan')
    0.625
    >>> sim_bag('ATCG', 'TAGC')
    1.0
    """
    distance = dist_bag(src, tar)
    return 1 - distance
2756
2757
2758
def editex(src, tar, cost=(0, 1, 2), local=False):
    """Return the Editex distance between two strings.

    Editex distance

    As described on pages 3 & 4 of
    Zobel, Justin and Philip Dart. 1996. Phonetic string matching: Lessons from
    information retrieval. In: Proceedings of the ACM-SIGIR Conference on
    Research and Development in Information Retrieval, Zurich, Switzerland.
    166–173. http://goanna.cs.rmit.edu.au/~jz/fulltext/sigir96.pdf

    The local variant is based on
    Ring, Nicholas and Alexandra L. Uitdenbogerd. 2009. Finding ‘Lucy in
    Disguise’: The Misheard Lyric Matching Problem. In: Proceedings of the 5th
    Asia Information Retrieval Symposium, Sapporo, Japan. 157-167.
    http://www.seg.rmit.edu.au/research/download.php?manuscript=404

    :param str src, tar: two strings to be compared
    :param tuple cost: a 3-tuple representing the cost of the four possible
        edits:
        match, same-group, and mismatch respectively (by default: (0, 1, 2))
    :param bool local: if True, the local variant of Editex is used
    :returns: Editex distance
    :rtype: int

    >>> editex('cat', 'hat')
    2
    >>> editex('Niall', 'Neil')
    2
    >>> editex('aluminum', 'Catalan')
    12
    >>> editex('ATCG', 'TAGC')
    6
    """
    match_cost, group_cost, mismatch_cost = cost
    # Phonetically similar letter groups per Zobel & Dart; a letter may
    # belong to more than one group.
    letter_groups = ({'A', 'E', 'I', 'O', 'U', 'Y'},
                     {'B', 'P'},
                     {'C', 'K', 'Q'},
                     {'D', 'T'},
                     {'L', 'R'},
                     {'M', 'N'},
                     {'G', 'J'},
                     {'F', 'P', 'V'},
                     {'S', 'X', 'Z'},
                     {'C', 'S', 'Z'})
    all_letters = {'A', 'B', 'C', 'D', 'E', 'F', 'G', 'I', 'J', 'K', 'L', 'M',
                   'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'X', 'Y', 'Z'}

    def r_cost(ch1, ch2):
        """Return r(a,b) according to Zobel & Dart's definition."""
        if ch1 == ch2:
            return match_cost
        if ch1 in all_letters and ch2 in all_letters:
            for group in letter_groups:
                if ch1 in group and ch2 in group:
                    return group_cost
        return mismatch_cost

    def d_cost(ch1, ch2):
        """Return d(a,b) according to Zobel & Dart's definition."""
        # 'H' and 'W' deletions are charged only the group cost
        if ch1 != ch2 and ch1 in {'H', 'W'}:
            return group_cost
        return r_cost(ch1, ch2)

    # convert both src & tar to NFKD normalized unicode
    src = unicodedata.normalize('NFKD', text_type(src.upper()))
    tar = unicodedata.normalize('NFKD', text_type(tar.upper()))
    # convert ß to SS (for Python2)
    src = src.replace('ß', 'SS')
    tar = tar.replace('ß', 'SS')

    if src == tar:
        return 0
    if not src:
        return len(tar) * mismatch_cost
    if not tar:
        return len(src) * mismatch_cost

    # np.int was deprecated in NumPy 1.20 and removed in 1.24; the builtin
    # int yields the same platform-default integer dtype.
    d_mat = np.zeros((len(src)+1, len(tar)+1), dtype=int)
    lens = len(src)
    lent = len(tar)
    # Pad with a leading sentinel so d_cost can look at the preceding char
    src = ' '+src
    tar = ' '+tar

    # First row/column initialization (skipped for src in the local variant)
    if not local:
        for i in range(1, lens+1):
            d_mat[i, 0] = d_mat[i-1, 0] + d_cost(src[i-1], src[i])
    for j in range(1, lent+1):
        d_mat[0, j] = d_mat[0, j-1] + d_cost(tar[j-1], tar[j])

    # Standard dynamic-programming fill: delete from src, delete from tar,
    # or substitute/match.
    for i in range(1, lens+1):
        for j in range(1, lent+1):
            d_mat[i, j] = min(d_mat[i-1, j] + d_cost(src[i-1], src[i]),
                              d_mat[i, j-1] + d_cost(tar[j-1], tar[j]),
                              d_mat[i-1, j-1] + r_cost(src[i], tar[j]))

    return d_mat[lens, lent]
2857
2858
2859
def dist_editex(src, tar, cost=(0, 1, 2), local=False):
    """Return the normalized Editex distance between two strings.

    Editex distance normalized to the interval [0, 1]

    The Editex distance is normalized by dividing the Editex distance
    (calculated by any of the three supported methods) by the greater of
    the number of characters in src times the cost of a delete and
    the number of characters in tar times the cost of an insert.
    For the case in which all operations have :math:`cost = 1`, this is
    equivalent to the greater of the length of the two strings src & tar.

    :param str src, tar: two strings to be compared
    :param tuple cost: a 3-tuple representing the cost of the four possible
        edits:
        match, same-group, and mismatch respectively (by default: (0, 1, 2))
    :param bool local: if True, the local variant of Editex is used
    :returns: normalized Editex distance
    :rtype: float

    >>> dist_editex('cat', 'hat')
    0.33333333333333331
    >>> dist_editex('Niall', 'Neil')
    0.20000000000000001
    >>> dist_editex('aluminum', 'Catalan')
    0.75
    >>> dist_editex('ATCG', 'TAGC')
    0.75
    """
    if src == tar:
        # Return a float for a consistent return type (was a bare int 0)
        return 0.0
    mismatch_cost = cost[2]
    # max(a*c, b*c) == max(a, b)*c for a non-negative cost c
    return (editex(src, tar, cost, local) /
            (max(len(src), len(tar)) * mismatch_cost))
2893
2894
2895
def sim_editex(src, tar, cost=(0, 1, 2), local=False):
    """Return the normalized Editex similarity of two strings.

    Editex similarity normalized to the interval [0, 1]

    The Editex similarity is the complement of Editex distance
    :math:`sim_{Editex} = 1 - dist_{Editex}`

    The arguments are identical to those of the editex() function.

    :param str src, tar: two strings to be compared
    :param tuple cost: a 3-tuple representing the cost of the four possible
        edits:
        match, same-group, and mismatch respectively (by default: (0, 1, 2))
    :param bool local: if True, the local variant of Editex is used
    :returns: normalized Editex similarity
    :rtype: float

    >>> sim_editex('cat', 'hat')
    0.66666666666666674
    >>> sim_editex('Niall', 'Neil')
    0.80000000000000004
    >>> sim_editex('aluminum', 'Catalan')
    0.25
    >>> sim_editex('ATCG', 'TAGC')
    0.25
    """
    distance = dist_editex(src, tar, cost, local)
    return 1 - distance
2923
2924
2925
def eudex_hamming(src, tar, weights='exponential', maxlength=8,
                  normalized=False):
    """Calculate the Hamming distance between the Eudex hashes of two terms.

    If weights is set to None, a simple Hamming distance is calculated.
    If weights is set to 'exponential', weight decays by powers of 2, as
         proposed in the eudex specification: https://github.com/ticki/eudex.
    If weights is set to 'fibonacci', weight decays through the Fibonacci
         series, as in the eudex reference implementation.
    If weights is set to a callable function, this assumes it creates a
         generator and the generator is used to populate a series of weights.
    If weights is set to an iterable, the iterable's values should be integers
         and will be used as the weights.

    :param str src, tar: two strings to be compared
    :param iterable or generator function weights: see above
    :param maxlength: the number of characters to encode as a eudex hash
    :param bool normalized: if True, the distance is divided by the maximum
        possible weighted distance
    :returns: the (weighted, optionally normalized) Hamming distance
    :rtype: int or float
    """
    def _gen_fibonacci():
        """Yield the next Fibonacci number.

        Based on https://www.python-course.eu/generators.php
        Starts at Fibonacci number 3 (the second 1)
        """
        num, nxt = 1, 2
        while True:
            yield num
            num, nxt = nxt, num + nxt

    def _gen_exponential(base=2):
        """Yield the next value in an exponential series of the base.

        Based on https://www.python-course.eu/generators.php
        Starts at base**0
        """
        exponent = 0
        while True:
            yield base ** exponent
            exponent += 1

    # Calculate the eudex hashes and XOR them
    xored = eudex(src, maxlength=maxlength) ^ eudex(tar, maxlength=maxlength)

    # Simple hamming distance (all bits are equal)
    if not weights:
        return bin(xored).count('1')

    # If weights is a function, it should create a generator,
    # which we now use to populate a list
    if callable(weights):
        weights = weights()
    elif weights == 'exponential':
        weights = _gen_exponential()
    elif weights == 'fibonacci':
        weights = _gen_fibonacci()
    if isinstance(weights, types.GeneratorType):
        # Heaviest weight applies to the first (most significant) byte
        weights = [next(weights) for _ in range(maxlength)][::-1]

    # Sum the weighted hamming distance, consuming one byte of the XORed
    # hashes per weight.  (Local renamed from `dist`, which shadowed the
    # module-level dist() function.)
    weighted_dist = 0
    maxdist = 0
    while (xored or normalized) and weights:
        maxdist += 8*weights[-1]
        weighted_dist += bin(xored & 0xFF).count('1') * weights.pop()
        xored >>= 8

    if normalized:
        weighted_dist /= maxdist

    return weighted_dist
2996
2997
2998
def dist_eudex(src, tar, weights='exponential', maxlength=8):
    """Calculate the normalized Hamming distance between Eudex hashes.

    This is eudex_hamming() with normalization enabled, so the result lies
    in the interval [0, 1].

    If weights is set to None, a simple Hamming distance is calculated.
    If weights is set to 'exponential', weight decays by powers of 2, as
         proposed in the eudex specification: https://github.com/ticki/eudex.
    If weights is set to 'fibonacci', weight decays through the Fibonacci
         series, as in the eudex reference implementation.
    If weights is set to a callable function, this assumes it creates a
         generator and the generator is used to populate a series of weights.
    If weights is set to an iterable, the iterable's values should be integers
         and will be used as the weights.

    :param str src, tar: two strings to be compared
    :param iterable or generator function weights: see above
    :param maxlength: the number of characters to encode as a eudex hash
    :returns: normalized Eudex Hamming distance
    :rtype: float
    """
    return eudex_hamming(src, tar, weights, maxlength, normalized=True)
3017
3018
3019
def sim_eudex(src, tar, weights='exponential', maxlength=8):
    """Calculate the normalized Hamming similarity between Eudex hashes.

    This is the complement of dist_eudex():
    :math:`sim_{eudex} = 1 - dist_{eudex}`

    If weights is set to None, a simple Hamming distance is calculated.
    If weights is set to 'exponential', weight decays by powers of 2, as
         proposed in the eudex specification: https://github.com/ticki/eudex.
    If weights is set to 'fibonacci', weight decays through the Fibonacci
         series, as in the eudex reference implementation.
    If weights is set to a callable function, this assumes it creates a
         generator and the generator is used to populate a series of weights.
    If weights is set to an iterable, the iterable's values should be integers
         and will be used as the weights.

    :param str src, tar: two strings to be compared
    :param iterable or generator function weights: see above
    :param maxlength: the number of characters to encode as a eudex hash
    :returns: normalized Eudex Hamming similarity
    :rtype: float
    """
    distance = dist_eudex(src, tar, weights, maxlength)
    return 1 - distance
3038
3039
3040
def sim_tfidf(src, tar, qval=2, docs_src=None, docs_tar=None):
    """Return the TF-IDF similarity of two strings.

    TF-IDF similarity

    This is chiefly based on the "Formal Definition of TF/IDF Distance" at:
    http://alias-i.com/lingpipe/docs/api/com/aliasi/spell/TfIdfDistance.html

    NOTE(review): this implementation is unfinished -- it currently returns a
    hardcoded 0.5 for any non-trivial input, and docs_tar is accepted but not
    yet used (presumably awaiting the IDF computation over a second corpus).

    :param str src, tar: two strings to be compared (or QGrams/Counter objects)
    :param int qval: the length of each q-gram; 0 or None for non-q-gram
        version
    :param Counter docs_src: a Counter object or string representing the
        document corpus for the src string
    :param Counter docs_tar: a Counter object or string representing the
        document corpus for the tar string (or set to None to use the docs_src
        for both)
    :returns: TF-IDF similarity
    :rtype: float
    """
    if src == tar:
        return 1.0  # TODO: confirm correctness of this when docs are different
    elif not src or not tar:
        return 0.0

    # q-gram (or whitespace-token) profiles of the two strings
    q_src, q_tar = _get_qgrams(src, tar, qval)

    # Build the corpus term profile: pass a Counter through unchanged,
    # otherwise derive q-grams (qval > 0) or whitespace tokens (qval falsy).
    # NOTE(review): q_docs is computed but not yet consumed below; it will
    # presumably feed the IDF term once the implementation is completed.
    if isinstance(docs_src, Counter):
        q_docs = docs_src
    elif qval and qval > 0:
        q_docs = QGrams(docs_src, qval)
    else:
        q_docs = Counter(docs_src.strip().split())

    if not q_src or not q_tar:
        return 0.0

    # TODO: finish implementation
    return 0.5  # hardcoded to half
3078
3079
###############################################################################
3080
3081
3082
def sim(src, tar, method=sim_levenshtein):
    """Return a similarity of two strings.

    This is a generalized function for calling other similarity functions.

    :param str src, tar: two strings to be compared
    :param function method: specifies the similarity metric (Levenshtein by
        default)
    :returns: similarity according to the specified function
    :rtype: float
    :raises AttributeError: if method is not callable

    >>> sim('cat', 'hat')
    0.66666666666666674
    >>> sim('Niall', 'Neil')
    0.40000000000000002
    >>> sim('aluminum', 'Catalan')
    0.125
    >>> sim('ATCG', 'TAGC')
    0.25
    """
    # Guard-clause form: no "else" needed after a return
    if callable(method):
        return method(src, tar)
    raise AttributeError('Unknown similarity function: ' + str(method))
3106
3107
3108
def dist(src, tar, method=sim_levenshtein):
    """Return a distance between two strings.

    This is a generalized function for calling other distance functions.

    :param str src, tar: two strings to be compared
    :param function method: specifies the similarity metric (Levenshtein by
        default) -- Note that this takes a similarity metric function, not
        a distance metric function.
    :returns: distance according to the specified function
    :rtype: float
    :raises AttributeError: if method is not callable

    >>> dist('cat', 'hat')
    0.33333333333333326
    >>> dist('Niall', 'Neil')
    0.59999999999999998
    >>> dist('aluminum', 'Catalan')
    0.875
    >>> dist('ATCG', 'TAGC')
    0.75
    """
    # Guard-clause form: no "else" needed after a return
    if callable(method):
        return 1 - method(src, tar)
    raise AttributeError('Unknown distance function: ' + str(method))
3133