# -*- coding: utf-8 -*-

# Copyright 2014-2018 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.

"""abydos.distance.

The distance module implements string edit distance functions including:

    - Levenshtein distance
    - Optimal String Alignment distance
    - Levenshtein-Damerau distance
    - Hamming distance
    - Tversky index
    - Sørensen–Dice coefficient & distance
    - Jaccard similarity coefficient & distance
    - overlap similarity & distance
    - Tanimoto coefficient & distance
    - Minkowski distance & similarity
    - Manhattan distance & similarity
    - Euclidean distance & similarity
    - Chebyshev distance & similarity
    - cosine similarity & distance
    - Jaro distance
    - Jaro-Winkler distance (incl. the strcmp95 algorithm variant)
    - Longest common substring
    - Ratcliff-Obershelp similarity & distance
    - Match Rating Algorithm similarity
    - Normalized Compression Distance (NCD) & similarity
    - Monge-Elkan similarity & distance
    - Matrix similarity
    - Needleman-Wunsch score
    - Smith-Waterman score
    - Gotoh score
    - Length similarity
    - Prefix, Suffix, and Identity similarity & distance
    - Modified Language-Independent Product Name Search (MLIPNS) similarity &
      distance
    - Bag distance
    - Editex distance
    - Eudex distances
    - Sift4 distance
    - Baystat distance & similarity
    - Typo distance
    - Indel distance
    - Synoname

Functions beginning with the prefixes 'sim' and 'dist' are guaranteed to be
in the range [0, 1], and sim_X = 1 - dist_X since the two are complements.
If a sim_X function is supplied identical src & tar arguments, it is guaranteed
to return 1; the corresponding dist_X function is guaranteed to return 0.
"""

from __future__ import division, unicode_literals

from codecs import encode
from collections import Counter, Iterable, defaultdict
from math import log, sqrt
from numbers import Number
from sys import maxsize, modules
from types import GeneratorType
from unicodedata import normalize

from numpy import float32 as np_float32
from numpy import int as np_int
from numpy import zeros as np_zeros

from six import text_type
from six.moves import range

from .compression import ac_encode, ac_train, rle_encode
from .fingerprint import _synoname_special_table, synoname_toolcode
from .phonetic import eudex, mra
from .qgram import QGrams

try:
    import lzma
except ImportError:  # pragma: no cover
    # If the system lacks the lzma library, that's fine, but lzma compression
    # similarity won't be supported.
    pass

__all__ = ['bag', 'chebyshev', 'damerau_levenshtein', 'dist', 'dist_bag',
           'dist_baystat', 'dist_chebyshev', 'dist_compression', 'dist_cosine',
           'dist_damerau', 'dist_dice', 'dist_editex', 'dist_euclidean',
           'dist_eudex', 'dist_hamming', 'dist_ident', 'dist_indel',
           'dist_jaccard', 'dist_jaro_winkler', 'dist_lcsseq', 'dist_lcsstr',
           'dist_length', 'dist_levenshtein', 'dist_manhattan',
           'dist_minkowski', 'dist_mlipns', 'dist_monge_elkan', 'dist_mra',
           'dist_overlap', 'dist_prefix', 'dist_ratcliff_obershelp',
           'dist_sift4', 'dist_strcmp95', 'dist_suffix', 'dist_tversky',
           'dist_typo', 'editex', 'euclidean', 'eudex', 'eudex_hamming',
           'gotoh', 'hamming', 'lcsseq', 'lcsstr', 'levenshtein', 'manhattan',
           'minkowski', 'mra_compare', 'needleman_wunsch', 'sift4_common',
           'sift4_simplest', 'sim', 'sim_bag', 'sim_baystat', 'sim_chebyshev',
           'sim_compression', 'sim_cosine', 'sim_damerau', 'sim_dice',
           'sim_editex', 'sim_euclidean', 'sim_eudex', 'sim_hamming',
           'sim_ident', 'sim_indel', 'sim_jaccard', 'sim_jaro_winkler',
           'sim_lcsseq', 'sim_lcsstr', 'sim_length', 'sim_levenshtein',
           'sim_manhattan', 'sim_matrix', 'sim_minkowski', 'sim_mlipns',
           'sim_monge_elkan', 'sim_mra', 'sim_overlap', 'sim_prefix',
           'sim_ratcliff_obershelp', 'sim_sift4', 'sim_strcmp95', 'sim_suffix',
           'sim_tanimoto', 'sim_tversky', 'sim_typo', 'smith_waterman',
           'synoname', 'synoname_word_approximation', 'tanimoto', 'typo']


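# A minimal sketch (not part of the original module) of the sim/dist
# complement guarantee stated in the module docstring above:
#
#     >>> sim_levenshtein('cat', 'hat') == 1 - dist_levenshtein('cat', 'hat')
#     True
#     >>> dist_levenshtein('cat', 'cat')
#     0

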
def levenshtein(src, tar, mode='lev', cost=(1, 1, 1, 1)):
    """Return the Levenshtein distance between two strings.

    This is the standard edit distance measure. Cf.
    :cite:`Levenshtein:1965,Levenshtein:1966`.

    Two additional variants: optimal string alignment (aka restricted
    Damerau-Levenshtein distance) :cite:`Boytsov:2011` and the
    Damerau-Levenshtein :cite:`Damerau:1964` distance are also supported.

    The ordinary Levenshtein & Optimal String Alignment distance both
    employ the Wagner-Fischer dynamic programming algorithm
    :cite:`Wagner:1974`.

    Levenshtein edit distance ordinarily has unit insertion, deletion, and
    substitution costs.

    :param str src, tar: two strings to be compared
    :param str mode: specifies a mode for computing the Levenshtein distance:

        - 'lev' (default) computes the ordinary Levenshtein distance,
          in which edits may include inserts, deletes, and substitutions
        - 'osa' computes the Optimal String Alignment distance, in which
          edits may include inserts, deletes, substitutions, and
          transpositions but substrings may only be edited once
        - 'dam' computes the Damerau-Levenshtein distance, in which
          edits may include inserts, deletes, substitutions, and
          transpositions and substrings may undergo repeated edits

    :param tuple cost: a 4-tuple representing the cost of the four possible
        edits: inserts, deletes, substitutions, and transpositions,
        respectively (by default: (1, 1, 1, 1))
    :returns: the Levenshtein distance between src & tar
    :rtype: int (may return a float if cost has float values)

    >>> levenshtein('cat', 'hat')
    1
    >>> levenshtein('Niall', 'Neil')
    3
    >>> levenshtein('aluminum', 'Catalan')
    7
    >>> levenshtein('ATCG', 'TAGC')
    3

    >>> levenshtein('ATCG', 'TAGC', mode='osa')
    2
    >>> levenshtein('ACTG', 'TAGC', mode='osa')
    4

    >>> levenshtein('ATCG', 'TAGC', mode='dam')
    2
    >>> levenshtein('ACTG', 'TAGC', mode='dam')
    3
    """
    ins_cost, del_cost, sub_cost, trans_cost = cost

    if src == tar:
        return 0
    if not src:
        return len(tar) * ins_cost
    if not tar:
        return len(src) * del_cost

    if 'dam' in mode:
        return damerau_levenshtein(src, tar, cost)

    d_mat = np_zeros((len(src)+1, len(tar)+1), dtype=np_int)
    for i in range(len(src)+1):
        d_mat[i, 0] = i * del_cost
    for j in range(len(tar)+1):
        d_mat[0, j] = j * ins_cost

    for i in range(len(src)):
        for j in range(len(tar)):
            d_mat[i+1, j+1] = min(
                d_mat[i+1, j] + ins_cost,  # ins
                d_mat[i, j+1] + del_cost,  # del
                d_mat[i, j] + (sub_cost if src[i] != tar[j] else 0)  # sub/==
            )

            if mode == 'osa':
                if ((i+1 > 1 and j+1 > 1 and src[i] == tar[j-1] and
                     src[i-1] == tar[j])):
                    # transposition
                    d_mat[i+1, j+1] = min(d_mat[i+1, j+1],
                                          d_mat[i-1, j-1] + trans_cost)

    return d_mat[len(src), len(tar)]


def dist_levenshtein(src, tar, mode='lev', cost=(1, 1, 1, 1)):
    """Return the normalized Levenshtein distance between two strings.

    The Levenshtein distance is normalized by dividing the Levenshtein
    distance (calculated by any of the three supported methods) by the
    greater of the number of characters in src times the cost of a delete and
    the number of characters in tar times the cost of an insert.
    For the case in which all operations have :math:`cost = 1`, this is
    equivalent to the length of the longer of the two strings src & tar.

    :param str src, tar: two strings to be compared
    :param str mode: specifies a mode for computing the Levenshtein distance:

        - 'lev' (default) computes the ordinary Levenshtein distance,
          in which edits may include inserts, deletes, and substitutions
        - 'osa' computes the Optimal String Alignment distance, in which
          edits may include inserts, deletes, substitutions, and
          transpositions but substrings may only be edited once
        - 'dam' computes the Damerau-Levenshtein distance, in which
          edits may include inserts, deletes, substitutions, and
          transpositions and substrings may undergo repeated edits

    :param tuple cost: a 4-tuple representing the cost of the four possible
        edits: inserts, deletes, substitutions, and transpositions,
        respectively (by default: (1, 1, 1, 1))
    :returns: normalized Levenshtein distance
    :rtype: float

    >>> round(dist_levenshtein('cat', 'hat'), 12)
    0.333333333333
    >>> round(dist_levenshtein('Niall', 'Neil'), 12)
    0.6
    >>> dist_levenshtein('aluminum', 'Catalan')
    0.875
    >>> dist_levenshtein('ATCG', 'TAGC')
    0.75
    """
    if src == tar:
        return 0
    ins_cost, del_cost = cost[:2]
    return (levenshtein(src, tar, mode, cost) /
            (max(len(src)*del_cost, len(tar)*ins_cost)))


def sim_levenshtein(src, tar, mode='lev', cost=(1, 1, 1, 1)):
    """Return the Levenshtein similarity of two strings.

    Normalized Levenshtein similarity is the complement of normalized
    Levenshtein distance:
    :math:`sim_{Levenshtein} = 1 - dist_{Levenshtein}`.

    :param str src, tar: two strings to be compared
    :param str mode: specifies a mode for computing the Levenshtein distance:

            - 'lev' (default) computes the ordinary Levenshtein distance,
              in which edits may include inserts, deletes, and substitutions
            - 'osa' computes the Optimal String Alignment distance, in which
              edits may include inserts, deletes, substitutions, and
              transpositions but substrings may only be edited once
            - 'dam' computes the Damerau-Levenshtein distance, in which
              edits may include inserts, deletes, substitutions, and
              transpositions and substrings may undergo repeated edits

    :param tuple cost: a 4-tuple representing the cost of the four possible
        edits:
        inserts, deletes, substitutions, and transpositions, respectively
        (by default: (1, 1, 1, 1))
    :returns: normalized Levenshtein similarity
    :rtype: float

    >>> round(sim_levenshtein('cat', 'hat'), 12)
    0.666666666667
    >>> round(sim_levenshtein('Niall', 'Neil'), 12)
    0.4
    >>> sim_levenshtein('aluminum', 'Catalan')
    0.125
    >>> sim_levenshtein('ATCG', 'TAGC')
    0.25
    """
    return 1 - dist_levenshtein(src, tar, mode, cost)


def damerau_levenshtein(src, tar, cost=(1, 1, 1, 1)):
    """Return the Damerau-Levenshtein distance between two strings.

    This computes the Damerau-Levenshtein distance :cite:`Damerau:1964`.
    Damerau-Levenshtein code is based on Java code by Kevin L. Stern
    :cite:`Stern:2014`, under the MIT license:
    https://github.com/KevinStern/software-and-algorithms/blob/master/src/main/java/blogspot/software_and_algorithms/stern_library/string/DamerauLevenshteinAlgorithm.java

    :param str src, tar: two strings to be compared
    :param tuple cost: a 4-tuple representing the cost of the four possible
        edits:
        inserts, deletes, substitutions, and transpositions, respectively
        (by default: (1, 1, 1, 1))
    :returns: the Damerau-Levenshtein distance between src & tar
    :rtype: int (may return a float if cost has float values)

    >>> damerau_levenshtein('cat', 'hat')
    1
    >>> damerau_levenshtein('Niall', 'Neil')
    3
    >>> damerau_levenshtein('aluminum', 'Catalan')
    7
    >>> damerau_levenshtein('ATCG', 'TAGC')
    2
    """
    ins_cost, del_cost, sub_cost, trans_cost = cost

    if src == tar:
        return 0
    if not src:
        return len(tar) * ins_cost
    if not tar:
        return len(src) * del_cost

    if 2*trans_cost < ins_cost + del_cost:
        raise ValueError('Unsupported cost assignment; the cost of two ' +
                         'transpositions must not be less than the cost of ' +
                         'an insert plus a delete.')

    d_mat = (np_zeros((len(src))*(len(tar)), dtype=np_int).
             reshape((len(src), len(tar))))

    if src[0] != tar[0]:
        d_mat[0, 0] = min(sub_cost, ins_cost + del_cost)

    src_index_by_character = {}
    src_index_by_character[src[0]] = 0
    for i in range(1, len(src)):
        del_distance = d_mat[i-1, 0] + del_cost
        ins_distance = (i+1) * del_cost + ins_cost
        match_distance = (i * del_cost +
                          (0 if src[i] == tar[0] else sub_cost))
        d_mat[i, 0] = min(del_distance, ins_distance, match_distance)

    for j in range(1, len(tar)):
        del_distance = (j+1) * ins_cost + del_cost
        ins_distance = d_mat[0, j-1] + ins_cost
        match_distance = (j * ins_cost +
                          (0 if src[0] == tar[j] else sub_cost))
        d_mat[0, j] = min(del_distance, ins_distance, match_distance)

    for i in range(1, len(src)):
        max_src_letter_match_index = (0 if src[i] == tar[0] else -1)
        for j in range(1, len(tar)):
            candidate_swap_index = (-1 if tar[j] not in
                                    src_index_by_character else
                                    src_index_by_character[tar[j]])
            j_swap = max_src_letter_match_index
            del_distance = d_mat[i-1, j] + del_cost
            ins_distance = d_mat[i, j-1] + ins_cost
            match_distance = d_mat[i-1, j-1]
            if src[i] != tar[j]:
                match_distance += sub_cost
            else:
                max_src_letter_match_index = j

            if candidate_swap_index != -1 and j_swap != -1:
                i_swap = candidate_swap_index

                if i_swap == 0 and j_swap == 0:
                    pre_swap_cost = 0
                else:
                    pre_swap_cost = d_mat[max(0, i_swap-1), max(0, j_swap-1)]
                swap_distance = (pre_swap_cost + (i - i_swap - 1) *
                                 del_cost + (j - j_swap - 1) * ins_cost +
                                 trans_cost)
            else:
                swap_distance = maxsize

            d_mat[i, j] = min(del_distance, ins_distance,
                              match_distance, swap_distance)
        src_index_by_character[src[i]] = i

    return d_mat[len(src)-1, len(tar)-1]


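# A hedged illustration (not from the original docstring) of the cost
# constraint enforced above: with cost=(1, 1, 1, 0), two transpositions
# would be cheaper than an insert plus a delete, so the function raises:
#
#     >>> damerau_levenshtein('ab', 'ba', cost=(1, 1, 1, 0))
#     Traceback (most recent call last):
#     ...
#     ValueError: Unsupported cost assignment; ...

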
def dist_damerau(src, tar, cost=(1, 1, 1, 1)):
    """Return the normalized Damerau-Levenshtein distance between two strings.

    The Damerau-Levenshtein distance is normalized to the interval [0, 1]
    by dividing it by the greater of
    the number of characters in src times the cost of a delete and
    the number of characters in tar times the cost of an insert.
    For the case in which all operations have :math:`cost = 1`, this is
    equivalent to the length of the longer of the two strings src & tar.

    The arguments are identical to those of the levenshtein() function.

    :param str src, tar: two strings to be compared
    :param tuple cost: a 4-tuple representing the cost of the four possible
        edits:
        inserts, deletes, substitutions, and transpositions, respectively
        (by default: (1, 1, 1, 1))
    :returns: normalized Damerau-Levenshtein distance
    :rtype: float

    >>> round(dist_damerau('cat', 'hat'), 12)
    0.333333333333
    >>> round(dist_damerau('Niall', 'Neil'), 12)
    0.6
    >>> dist_damerau('aluminum', 'Catalan')
    0.875
    >>> dist_damerau('ATCG', 'TAGC')
    0.5
    """
    if src == tar:
        return 0
    ins_cost, del_cost = cost[:2]
    return (damerau_levenshtein(src, tar, cost) /
            (max(len(src)*del_cost, len(tar)*ins_cost)))


def sim_damerau(src, tar, cost=(1, 1, 1, 1)):
    """Return the Damerau-Levenshtein similarity of two strings.

    Normalized Damerau-Levenshtein similarity is the complement of normalized
    Damerau-Levenshtein distance:
    :math:`sim_{Damerau} = 1 - dist_{Damerau}`.

    The arguments are identical to those of the levenshtein() function.

    :param str src, tar: two strings to be compared
    :param tuple cost: a 4-tuple representing the cost of the four possible
        edits:
        inserts, deletes, substitutions, and transpositions, respectively
        (by default: (1, 1, 1, 1))
    :returns: normalized Damerau-Levenshtein similarity
    :rtype: float

    >>> round(sim_damerau('cat', 'hat'), 12)
    0.666666666667
    >>> round(sim_damerau('Niall', 'Neil'), 12)
    0.4
    >>> sim_damerau('aluminum', 'Catalan')
    0.125
    >>> sim_damerau('ATCG', 'TAGC')
    0.5
    """
    return 1 - dist_damerau(src, tar, cost)


def hamming(src, tar, difflens=True):
    """Return the Hamming distance between two strings.

    Hamming distance :cite:`Hamming:1950` equals the number of character
    positions at which two strings differ. For strings of unequal lengths,
    it is not normally defined. By default, this implementation calculates the
    Hamming distance of the first n characters where n is the lesser of the two
    strings' lengths and adds to this the difference in string lengths.

    :param str src, tar: two strings to be compared
    :param bool difflens:
        If True (default), this returns the Hamming distance for those
        characters that have a matching character in both strings plus the
        difference in the strings' lengths. This is equivalent to extending
        the shorter string with obligatorily non-matching characters.
        If False, an exception is raised in the case of strings of unequal
        lengths.
    :returns: the Hamming distance between src & tar
    :rtype: int

    >>> hamming('cat', 'hat')
    1
    >>> hamming('Niall', 'Neil')
    3
    >>> hamming('aluminum', 'Catalan')
    8
    >>> hamming('ATCG', 'TAGC')
    4
    """
    if not difflens and len(src) != len(tar):
        raise ValueError('Undefined for sequences of unequal length; set ' +
                         'difflens to True for Hamming distance between ' +
                         'strings of unequal lengths.')

    hdist = 0
    if difflens:
        hdist += abs(len(src)-len(tar))
    hdist += sum(c1 != c2 for c1, c2 in zip(src, tar))

    return hdist


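# A short sketch (not from the original docstring) of the difflens flag:
# the length difference counts as mismatches by default, while
# difflens=False makes unequal lengths an error:
#
#     >>> hamming('cat', 'cats')
#     1
#     >>> hamming('cat', 'cats', difflens=False)
#     Traceback (most recent call last):
#     ...
#     ValueError: Undefined for sequences of unequal length; ...

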
def dist_hamming(src, tar, difflens=True):
    """Return the normalized Hamming distance between two strings.

    Hamming distance normalized to the interval [0, 1].

    The Hamming distance is normalized by dividing it
    by the greater of the number of characters in src & tar (unless difflens is
    set to False, in which case an exception is raised).

    The arguments are identical to those of the hamming() function.

    :param str src, tar: two strings to be compared
    :param bool difflens:
        If True (default), this returns the Hamming distance for those
        characters that have a matching character in both strings plus the
        difference in the strings' lengths. This is equivalent to extending
        the shorter string with obligatorily non-matching characters.
        If False, an exception is raised in the case of strings of unequal
        lengths.
    :returns: normalized Hamming distance
    :rtype: float

    >>> round(dist_hamming('cat', 'hat'), 12)
    0.333333333333
    >>> dist_hamming('Niall', 'Neil')
    0.6
    >>> dist_hamming('aluminum', 'Catalan')
    1.0
    >>> dist_hamming('ATCG', 'TAGC')
    1.0
    """
    if src == tar:
        return 0
    return hamming(src, tar, difflens) / max(len(src), len(tar))


def sim_hamming(src, tar, difflens=True):
    """Return the normalized Hamming similarity of two strings.

    Hamming similarity normalized to the interval [0, 1].

    Hamming similarity is the complement of normalized Hamming distance:
    :math:`sim_{Hamming} = 1 - dist_{Hamming}`.

    Provided that difflens==True, the Hamming similarity is identical to the
    Language-Independent Product Name Search (LIPNS) similarity score. For
    further information, see the sim_mlipns documentation.

    The arguments are identical to those of the hamming() function.

    :param str src, tar: two strings to be compared
    :param bool difflens:
        If True (default), this returns the Hamming distance for those
        characters that have a matching character in both strings plus the
        difference in the strings' lengths. This is equivalent to extending
        the shorter string with obligatorily non-matching characters.
        If False, an exception is raised in the case of strings of unequal
        lengths.
    :returns: normalized Hamming similarity
    :rtype: float

    >>> round(sim_hamming('cat', 'hat'), 12)
    0.666666666667
    >>> sim_hamming('Niall', 'Neil')
    0.4
    >>> sim_hamming('aluminum', 'Catalan')
    0.0
    >>> sim_hamming('ATCG', 'TAGC')
    0.0
    """
    return 1 - dist_hamming(src, tar, difflens)


def _get_qgrams(src, tar, qval=0, skip=0):
    """Return the Q-Grams in src & tar.

    :param str src, tar: two strings to be compared
        (or QGrams/Counter objects)
    :param int qval: the length of each q-gram; 0 for non-q-gram version
    :param int skip: the number of characters to skip (only works when
        src and tar are strings)
    :return: Q-Grams
    """
    if isinstance(src, Counter) and isinstance(tar, Counter):
        return src, tar
    if qval > 0:
        return (QGrams(src, qval, '$#', skip),
                QGrams(tar, qval, '$#', skip))
    return Counter(src.strip().split()), Counter(tar.strip().split())


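# An illustrative sketch of _get_qgrams (not from the original module);
# the expected values assume QGrams behaves as a Counter of 2-grams
# padded with the '$'/'#' start & stop symbols passed above:
#
#     >>> q_src, q_tar = _get_qgrams('cat', 'hat', qval=2)
#     >>> sorted(q_src.elements())
#     ['$c', 'at', 'ca', 't#']
#     >>> sorted((q_src & q_tar).elements())
#     ['at', 't#']

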
def sim_tversky(src, tar, qval=2, alpha=1, beta=1, bias=None):
    r"""Return the Tversky index of two strings.

    The Tversky index :cite:`Tversky:1977` is defined as:
    For two sets X and Y:
    :math:`sim_{Tversky}(X, Y) = \\frac{|X \\cap Y|}
    {|X \\cap Y| + \\alpha|X - Y| + \\beta|Y - X|}`.

    :math:`\\alpha = \\beta = 1` is equivalent to the Jaccard & Tanimoto
    similarity coefficients.

    :math:`\\alpha = \\beta = 0.5` is equivalent to the Sørensen-Dice
    similarity coefficient :cite:`Dice:1945,Sorensen:1948`.

    Unequal α and β will tend to emphasize one or the other set's
    contributions:

        - :math:`\\alpha > \\beta` emphasizes the contributions of X over Y
        - :math:`\\alpha < \\beta` emphasizes the contributions of Y over X

    Parameter values' relation to 1 emphasizes different types of
    contributions:

        - :math:`\\alpha, \\beta > 1` emphasize unique contributions over the
          intersection
        - :math:`\\alpha, \\beta < 1` emphasize the intersection over unique
          contributions

    The symmetric variant is defined in :cite:`Jiminez:2013`. This is activated
    by specifying a bias parameter.

    :param str src, tar: two strings to be compared
        (or QGrams/Counter objects)
    :param int qval: the length of each q-gram; 0 or None for non-q-gram
        version
    :param float alpha, beta: two Tversky index parameters as indicated in the
        description above
    :returns: Tversky similarity
    :rtype: float

    >>> sim_tversky('cat', 'hat')
    0.3333333333333333
    >>> sim_tversky('Niall', 'Neil')
    0.2222222222222222
    >>> sim_tversky('aluminum', 'Catalan')
    0.0625
    >>> sim_tversky('ATCG', 'TAGC')
    0.0
    """
    if alpha < 0 or beta < 0:
        raise ValueError('Unsupported weight assignment; alpha and beta ' +
                         'must be greater than or equal to 0.')

    if src == tar:
        return 1.0
    if not src or not tar:
        return 0.0

    q_src, q_tar = _get_qgrams(src, tar, qval)
    q_src_mag = sum(q_src.values())
    q_tar_mag = sum(q_tar.values())
    q_intersection_mag = sum((q_src & q_tar).values())

    if not q_src or not q_tar:
        return 0.0

    if bias is None:
        return q_intersection_mag / (q_intersection_mag + alpha *
                                     (q_src_mag - q_intersection_mag) +
                                     beta * (q_tar_mag - q_intersection_mag))

    a_val = min(q_src_mag - q_intersection_mag,
                q_tar_mag - q_intersection_mag)
    b_val = max(q_src_mag - q_intersection_mag,
                q_tar_mag - q_intersection_mag)
    c_val = q_intersection_mag + bias
    return c_val / (beta * (alpha * a_val + (1 - alpha) * b_val) + c_val)


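# A quick numeric sketch (not from the original docstring) of the parameter
# settings discussed above, using the default 2-grams; the values agree
# with the sim_dice and sim_jaccard doctests below:
#
#     >>> sim_tversky('cat', 'hat', alpha=0.5, beta=0.5)  # Sørensen–Dice
#     0.5
#     >>> sim_tversky('cat', 'hat', alpha=1, beta=1)  # Jaccard/Tanimoto
#     0.3333333333333333

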
def dist_tversky(src, tar, qval=2, alpha=1, beta=1, bias=None):
    """Return the Tversky distance between two strings.

    Tversky distance is the complement of the Tversky index (similarity):
    :math:`dist_{Tversky} = 1-sim_{Tversky}`.

    :param str src, tar: two strings to be compared
        (or QGrams/Counter objects)
    :param int qval: the length of each q-gram; 0 or None for non-q-gram
        version
    :param float alpha, beta: two Tversky index parameters as indicated in the
        description of sim_tversky
    :returns: Tversky distance
    :rtype: float

    >>> dist_tversky('cat', 'hat')
    0.6666666666666667
    >>> dist_tversky('Niall', 'Neil')
    0.7777777777777778
    >>> dist_tversky('aluminum', 'Catalan')
    0.9375
    >>> dist_tversky('ATCG', 'TAGC')
    1.0
    """
    return 1 - sim_tversky(src, tar, qval, alpha, beta, bias)


def sim_dice(src, tar, qval=2):
    r"""Return the Sørensen–Dice coefficient of two strings.

    For two sets X and Y, the Sørensen–Dice coefficient
    :cite:`Dice:1945,Sorensen:1948` is
    :math:`sim_{dice}(X, Y) = \\frac{2 \\cdot |X \\cap Y|}{|X| + |Y|}`.

    This is identical to the Tversky index :cite:`Tversky:1977` for
    :math:`\\alpha = \\beta = 0.5`.

    :param str src, tar: two strings to be compared (or QGrams/Counter objects)
    :param int qval: the length of each q-gram; 0 or None for non-q-gram
        version
    :returns: Sørensen–Dice similarity
    :rtype: float

    >>> sim_dice('cat', 'hat')
    0.5
    >>> sim_dice('Niall', 'Neil')
    0.36363636363636365
    >>> sim_dice('aluminum', 'Catalan')
    0.11764705882352941
    >>> sim_dice('ATCG', 'TAGC')
    0.0
    """
    return sim_tversky(src, tar, qval, 0.5, 0.5)


def dist_dice(src, tar, qval=2):
    """Return the Sørensen–Dice distance between two strings.

    Sørensen–Dice distance is the complement of the Sørensen–Dice coefficient:
    :math:`dist_{dice} = 1 - sim_{dice}`.

    :param str src, tar: two strings to be compared (or QGrams/Counter objects)
    :param int qval: the length of each q-gram; 0 or None for non-q-gram
        version
    :returns: Sørensen–Dice distance
    :rtype: float

    >>> dist_dice('cat', 'hat')
    0.5
    >>> dist_dice('Niall', 'Neil')
    0.6363636363636364
    >>> dist_dice('aluminum', 'Catalan')
    0.8823529411764706
    >>> dist_dice('ATCG', 'TAGC')
    1.0
    """
    return 1 - sim_dice(src, tar, qval)


def sim_jaccard(src, tar, qval=2):
    r"""Return the Jaccard similarity of two strings.

    For two sets X and Y, the Jaccard similarity coefficient
    :cite:`Jaccard:1901` is :math:`sim_{jaccard}(X, Y) =
    \\frac{|X \\cap Y|}{|X \\cup Y|}`.

    This is identical to the Tanimoto similarity coefficient
    :cite:`Tanimoto:1958`
    and the Tversky index :cite:`Tversky:1977` for
    :math:`\\alpha = \\beta = 1`.

    :param str src, tar: two strings to be compared (or QGrams/Counter objects)
    :param int qval: the length of each q-gram; 0 or None for non-q-gram
        version
    :returns: Jaccard similarity
    :rtype: float

    >>> sim_jaccard('cat', 'hat')
    0.3333333333333333
    >>> sim_jaccard('Niall', 'Neil')
    0.2222222222222222
    >>> sim_jaccard('aluminum', 'Catalan')
    0.0625
    >>> sim_jaccard('ATCG', 'TAGC')
    0.0
    """
    return sim_tversky(src, tar, qval, 1, 1)


def dist_jaccard(src, tar, qval=2):
    """Return the Jaccard distance between two strings.

    Jaccard distance is the complement of the Jaccard similarity coefficient:
    :math:`dist_{Jaccard} = 1 - sim_{Jaccard}`.

    :param str src, tar: two strings to be compared (or QGrams/Counter objects)
    :param int qval: the length of each q-gram; 0 or None for non-q-gram
        version
    :returns: Jaccard distance
    :rtype: float

    >>> dist_jaccard('cat', 'hat')
    0.6666666666666667
    >>> dist_jaccard('Niall', 'Neil')
    0.7777777777777778
    >>> dist_jaccard('aluminum', 'Catalan')
    0.9375
    >>> dist_jaccard('ATCG', 'TAGC')
    1.0
    """
    return 1 - sim_jaccard(src, tar, qval)


def sim_overlap(src, tar, qval=2):
    r"""Return the overlap coefficient of two strings.

    For two sets X and Y, the overlap coefficient
    :cite:`Szymkiewicz:1934,Simpson:1949`, also called the
    Szymkiewicz-Simpson coefficient, is
    :math:`sim_{overlap}(X, Y) = \\frac{|X \\cap Y|}{min(|X|, |Y|)}`.

    :param str src, tar: two strings to be compared (or QGrams/Counter objects)
    :param int qval: the length of each q-gram; 0 or None for non-q-gram
        version
    :returns: overlap similarity
    :rtype: float

    >>> sim_overlap('cat', 'hat')
    0.5
    >>> sim_overlap('Niall', 'Neil')
    0.4
    >>> sim_overlap('aluminum', 'Catalan')
    0.125
    >>> sim_overlap('ATCG', 'TAGC')
    0.0
    """
    if src == tar:
        return 1.0
    if not src or not tar:
        return 0.0

    q_src, q_tar = _get_qgrams(src, tar, qval)
    q_src_mag = sum(q_src.values())
    q_tar_mag = sum(q_tar.values())
    q_intersection_mag = sum((q_src & q_tar).values())

    return q_intersection_mag / min(q_src_mag, q_tar_mag)


def dist_overlap(src, tar, qval=2):
    """Return the overlap distance between two strings.

    Overlap distance is the complement of the overlap coefficient:
    :math:`dist_{overlap} = 1 - sim_{overlap}`.

    :param str src, tar: two strings to be compared (or QGrams/Counter objects)
    :param int qval: the length of each q-gram; 0 or None for non-q-gram
        version
    :returns: overlap distance
    :rtype: float

    >>> dist_overlap('cat', 'hat')
    0.5
    >>> dist_overlap('Niall', 'Neil')
    0.6
    >>> dist_overlap('aluminum', 'Catalan')
    0.875
    >>> dist_overlap('ATCG', 'TAGC')
    1.0
    """
    return 1 - sim_overlap(src, tar, qval)


def sim_tanimoto(src, tar, qval=2):
    r"""Return the Tanimoto similarity of two strings.

    For two sets X and Y, the Tanimoto similarity coefficient
    :cite:`Tanimoto:1958` is
    :math:`sim_{Tanimoto}(X, Y) = \\frac{|X \\cap Y|}{|X \\cup Y|}`.

    This is identical to the Jaccard similarity coefficient
    :cite:`Jaccard:1901` and the Tversky index :cite:`Tversky:1977` for
    :math:`\\alpha = \\beta = 1`.

    :param str src, tar: two strings to be compared (or QGrams/Counter objects)
    :param int qval: the length of each q-gram; 0 or None for non-q-gram
        version
    :returns: Tanimoto similarity
    :rtype: float

    >>> sim_tanimoto('cat', 'hat')
    0.3333333333333333
    >>> sim_tanimoto('Niall', 'Neil')
    0.2222222222222222
    >>> sim_tanimoto('aluminum', 'Catalan')
    0.0625
    >>> sim_tanimoto('ATCG', 'TAGC')
    0.0
    """
    return sim_jaccard(src, tar, qval)


def tanimoto(src, tar, qval=2):
    """Return the Tanimoto distance between two strings.

    Tanimoto distance is :math:`-log_{2}sim_{Tanimoto}`.

    :param str src, tar: two strings to be compared (or QGrams/Counter objects)
    :param int qval: the length of each q-gram; 0 or None for non-q-gram
        version
    :returns: Tanimoto distance
    :rtype: float

    >>> tanimoto('cat', 'hat')
    -1.5849625007211563
    >>> tanimoto('Niall', 'Neil')
    -2.1699250014423126
    >>> tanimoto('aluminum', 'Catalan')
    -4.0
    >>> tanimoto('ATCG', 'TAGC')
    -inf
    """
    coeff = sim_jaccard(src, tar, qval)
    if coeff != 0:
        return log(coeff, 2)

    return float('-inf')


def minkowski(src, tar, qval=2, pval=1, normalize=False, alphabet=None):
    """Return the Minkowski distance (:math:`L^p-norm`) of two strings.

    The Minkowski distance :cite:`Minkowski:1910` is a distance metric in
    :math:`L^p-space`.

    :param str src, tar: two strings to be compared (or QGrams/Counter objects)
    :param int qval: the length of each q-gram; 0 or None for non-q-gram
        version
    :param pval: the :math:`p`-value of the :math:`L^p`-space.
    :param normalize: normalizes to [0, 1] if True
    :param collection or int alphabet: the values or size of the alphabet
    :returns: the Minkowski distance
    :rtype: float
    """
    q_src, q_tar = _get_qgrams(src, tar, qval)
    diffs = ((q_src - q_tar) + (q_tar - q_src)).values()

    normalizer = 1
    if normalize:
        totals = (q_src + q_tar).values()
        if alphabet is not None:
            normalizer = (alphabet if isinstance(alphabet, Number) else
                          len(alphabet))
        elif pval == 0:
            normalizer = len(totals)
        else:
            normalizer = sum(_**pval for _ in totals)**(1 / pval)

    if not diffs:
        return 0.0
    if pval == float('inf'):
        # Chebyshev distance
        return max(diffs)/normalizer
    if pval == 0:
        # This is the l_0 "norm" as developed by David Donoho
        return len(diffs)/normalizer
    return sum(_**pval for _ in diffs)**(1 / pval)/normalizer


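# The docstring above has no doctests; a hedged sketch, assuming the
# default QGrams '$'/'#' padding ('cat' and 'hat' share 2 of their 4
# 2-grams, so four grams differ by one count each):
#
#     >>> minkowski('cat', 'hat')          # L^1 norm of the count diffs
#     4.0
#     >>> minkowski('cat', 'hat', pval=2)  # L^2 norm
#     2.0

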
def dist_minkowski(src, tar, qval=2, pval=1, alphabet=None):
    """Return Minkowski distance of two strings, normalized to [0, 1].

    The normalized Minkowski distance :cite:`Minkowski:1910` is a distance
    metric in :math:`L^p-space`, normalized to [0, 1].

    :param str src, tar: two strings to be compared (or QGrams/Counter objects)
    :param int qval: the length of each q-gram; 0 or None for non-q-gram
        version
    :param pval: the :math:`p`-value of the :math:`L^p`-space.
    :param collection or int alphabet: the values or size of the alphabet
    :returns: the normalized Minkowski distance
    :rtype: float
    """
    return minkowski(src, tar, qval, pval, True, alphabet)


def sim_minkowski(src, tar, qval=2, pval=1, alphabet=None):
    """Return Minkowski similarity of two strings, normalized to [0, 1].

    Minkowski similarity is the complement of Minkowski distance:
    :math:`sim_{Minkowski} = 1 - dist_{Minkowski}`.

    :param str src, tar: two strings to be compared (or QGrams/Counter objects)
    :param int qval: the length of each q-gram; 0 or None for non-q-gram
        version
    :param pval: the :math:`p`-value of the :math:`L^p`-space.
    :param collection or int alphabet: the values or size of the alphabet
    :returns: the normalized Minkowski similarity
    :rtype: float
    """
    return 1-minkowski(src, tar, qval, pval, True, alphabet)


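# A hedged sketch of the normalized variants above (for 'cat'/'hat' the
# combined 2-gram counts sum to 8, so the L^1 normalizer is 8):
#
#     >>> dist_minkowski('cat', 'hat')
#     0.5
#     >>> sim_minkowski('cat', 'hat')
#     0.5

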
def manhattan(src, tar, qval=2, normalize=False, alphabet=None):
    """Return the Manhattan distance between two strings.

    Manhattan distance is the city-block or taxi-cab distance, equivalent
    to Minkowski distance in :math:`L^1`-space.

    :param str src, tar: two strings to be compared (or QGrams/Counter objects)
    :param int qval: the length of each q-gram; 0 or None for non-q-gram
        version
    :param normalize: normalizes to [0, 1] if True
    :param collection or int alphabet: the values or size of the alphabet
    :returns: the Manhattan distance
    :rtype: float
    """
    return minkowski(src, tar, qval, 1, normalize, alphabet)


def dist_manhattan(src, tar, qval=2, alphabet=None):
    """Return the Manhattan distance between two strings, normalized to [0, 1].

    The normalized Manhattan distance is a distance
    metric in :math:`L^1-space`, normalized to [0, 1].

    :param str src, tar: two strings to be compared (or QGrams/Counter objects)
    :param int qval: the length of each q-gram; 0 or None for non-q-gram
        version
    :param collection or int alphabet: the values or size of the alphabet
    :returns: the normalized Manhattan distance
    :rtype: float
    """
    return manhattan(src, tar, qval, True, alphabet)


def sim_manhattan(src, tar, qval=2, alphabet=None):
    """Return the Manhattan similarity of two strings, normalized to [0, 1].

    Manhattan similarity is the complement of Manhattan distance:
    :math:`sim_{Manhattan} = 1 - dist_{Manhattan}`.

    :param str src, tar: two strings to be compared (or QGrams/Counter objects)
    :param int qval: the length of each q-gram; 0 or None for non-q-gram
        version
    :param collection or int alphabet: the values or size of the alphabet
    :returns: the normalized Manhattan similarity
    :rtype: float
    """
    return 1-manhattan(src, tar, qval, True, alphabet)


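# A brief sketch (not from the original docstrings): Manhattan is just
# Minkowski with pval=1, so the values mirror the minkowski() sketch:
#
#     >>> manhattan('cat', 'hat')
#     4.0
#     >>> dist_manhattan('cat', 'hat')
#     0.5

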
def euclidean(src, tar, qval=2, normalize=False, alphabet=None):
    """Return the Euclidean distance between two strings.

    Euclidean distance is the straight-line or as-the-crow-flies distance,
    equivalent to Minkowski distance in :math:`L^2`-space.

    :param str src, tar: two strings to be compared (or QGrams/Counter objects)
    :param int qval: the length of each q-gram; 0 or None for non-q-gram
        version
    :param normalize: normalizes to [0, 1] if True
    :param collection or int alphabet: the values or size of the alphabet
    :returns: the Euclidean distance
    :rtype: float
    """
    return minkowski(src, tar, qval, 2, normalize, alphabet)


def dist_euclidean(src, tar, qval=2, alphabet=None):
    """Return the Euclidean distance between two strings, normalized to [0, 1].

    The normalized Euclidean distance is a distance
    metric in :math:`L^2-space`, normalized to [0, 1].

    :param str src, tar: two strings to be compared (or QGrams/Counter objects)
    :param int qval: the length of each q-gram; 0 or None for non-q-gram
        version
    :param collection or int alphabet: the values or size of the alphabet
    :returns: the normalized Euclidean distance
    :rtype: float
    """
    return euclidean(src, tar, qval, True, alphabet)


def sim_euclidean(src, tar, qval=2, alphabet=None):
    """Return the Euclidean similarity of two strings, normalized to [0, 1].

    Euclidean similarity is the complement of Euclidean distance:
    :math:`sim_{Euclidean} = 1 - dist_{Euclidean}`.

    :param str src, tar: two strings to be compared (or QGrams/Counter objects)
    :param int qval: the length of each q-gram; 0 or None for non-q-gram
        version
    :param collection or int alphabet: the values or size of the alphabet
    :returns: the normalized Euclidean similarity
    :rtype: float
    """
    return 1-euclidean(src, tar, qval, True, alphabet)


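# A hedged sketch for the Euclidean variants above: four grams differ by
# one count each, giving sqrt(4) = 2.0, and the normalizer for 'cat'/'hat'
# works out to sqrt(1+1+4+4+1+1) = sqrt(12):
#
#     >>> euclidean('cat', 'hat')
#     2.0
#     >>> round(dist_euclidean('cat', 'hat'), 12)
#     0.57735026919

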
def chebyshev(src, tar, qval=2, normalize=False, alphabet=None):
    r"""Return the Chebyshev distance between two strings.

    Chebyshev distance is the chessboard distance,
    equivalent to Minkowski distance in :math:`L^\infty`-space.

    :param str src, tar: two strings to be compared (or QGrams/Counter objects)
    :param int qval: the length of each q-gram; 0 or None for non-q-gram
        version
    :param normalize: normalizes to [0, 1] if True
    :param collection or int alphabet: the values or size of the alphabet
    :returns: the Chebyshev distance
    :rtype: float
    """
    return minkowski(src, tar, qval, float('inf'), normalize, alphabet)


def dist_chebyshev(src, tar, qval=2, alphabet=None):
    """Return the Chebyshev distance between two strings, normalized to [0, 1].

    The normalized Chebyshev distance :cite:`Minkowski:1910` is a distance
    metric in :math:`L^\\infty-space`, normalized to [0, 1].

    :param str src, tar: two strings to be compared (or QGrams/Counter objects)
    :param int qval: the length of each q-gram; 0 or None for non-q-gram
        version
    :param collection or int alphabet: the values or size of the alphabet
    :returns: the normalized Chebyshev distance
    :rtype: float
    """
    return chebyshev(src, tar, qval, True, alphabet)


def sim_chebyshev(src, tar, qval=2, alphabet=None):
    """Return the Chebyshev similarity of two strings, normalized to [0, 1].

    Chebyshev similarity is the complement of Chebyshev distance:
    :math:`sim_{Chebyshev} = 1 - dist_{Chebyshev}`.

    :param str src, tar: two strings to be compared (or QGrams/Counter objects)
    :param int qval: the length of each q-gram; 0 or None for non-q-gram
        version
    :param collection or int alphabet: the values or size of the alphabet
    :returns: the normalized Chebyshev similarity
    :rtype: float
    """
    return 1 - chebyshev(src, tar, qval, True, alphabet)


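# A hedged sketch of the unnormalized Chebyshev distance above: it is the
# largest single q-gram count difference, e.g.
#
#     >>> chebyshev('cat', 'hat')
#     1.0
#     >>> chebyshev('aaa', 'a')  # 'aa' occurs two more times in 'aaa'
#     2.0

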
def sim_cosine(src, tar, qval=2):
    r"""Return the cosine similarity of two strings.

    For two sets X and Y, the cosine similarity, Otsuka-Ochiai coefficient, or
    Ochiai coefficient :cite:`Otsuka:1936,Ochiai:1957` is:
    :math:`sim_{cosine}(X, Y) = \\frac{|X \\cap Y|}{\\sqrt{|X| \\cdot |Y|}}`.

    :param str src, tar: two strings to be compared (or QGrams/Counter objects)
    :param int qval: the length of each q-gram; 0 or None for non-q-gram
        version
    :returns: cosine similarity
    :rtype: float

    >>> sim_cosine('cat', 'hat')
    0.5
    >>> sim_cosine('Niall', 'Neil')
    0.3651483716701107
    >>> sim_cosine('aluminum', 'Catalan')
    0.11785113019775793
    >>> sim_cosine('ATCG', 'TAGC')
    0.0
    """
    if src == tar:
        return 1.0
    if not src or not tar:
        return 0.0

    q_src, q_tar = _get_qgrams(src, tar, qval)
    q_src_mag = sum(q_src.values())
    q_tar_mag = sum(q_tar.values())
    q_intersection_mag = sum((q_src & q_tar).values())

    return q_intersection_mag / sqrt(q_src_mag * q_tar_mag)


def dist_cosine(src, tar, qval=2):
    """Return the cosine distance between two strings.

    Cosine distance is the complement of cosine similarity:
    :math:`dist_{cosine} = 1 - sim_{cosine}`.

    :param str src, tar: two strings to be compared (or QGrams/Counter objects)
    :param int qval: the length of each q-gram; 0 or None for non-q-gram
        version
    :returns: cosine distance
    :rtype: float

    >>> dist_cosine('cat', 'hat')
    0.5
    >>> dist_cosine('Niall', 'Neil')
    0.6348516283298893
    >>> dist_cosine('aluminum', 'Catalan')
    0.882148869802242
    >>> dist_cosine('ATCG', 'TAGC')
    1.0
    """
    return 1 - sim_cosine(src, tar, qval)


def sim_strcmp95(src, tar, long_strings=False):
    """Return the strcmp95 similarity of two strings.

    This is a Python translation of the C code for strcmp95:
    http://web.archive.org/web/20110629121242/http://www.census.gov/geo/msb/stand/strcmp.c
    :cite:`Winkler:1994`.
    The above file is a US Government publication and, accordingly,
    in the public domain.

    This is based on the Jaro-Winkler distance, but also attempts to correct
    for some common typos and frequently confused characters. It is also
    limited to uppercase ASCII characters, so it is appropriate to American
    names, but not much else.

    :param str src, tar: two strings to be compared
    :param bool long_strings: set to True to "Increase the probability of a
        match when the number of matched characters is large.  This option
        allows for a little more tolerance when the strings are large. It is
        not an appropriate test when comparing fixed length fields such as
        phone and social security numbers."
    :returns: strcmp95 similarity
    :rtype: float

    >>> sim_strcmp95('cat', 'hat')
    0.7777777777777777
    >>> sim_strcmp95('Niall', 'Neil')
    0.8454999999999999
    >>> sim_strcmp95('aluminum', 'Catalan')
    0.6547619047619048
    >>> sim_strcmp95('ATCG', 'TAGC')
    0.8333333333333334
    """
    def _inrange(char):
        """Return True if char is in the range (0, 91)."""
        return ord(char) > 0 and ord(char) < 91

    ying = src.strip().upper()
    yang = tar.strip().upper()

    if ying == yang:
        return 1.0
    # If either string is blank - return - added in Version 2
    if not ying or not yang:
        return 0.0

    adjwt = defaultdict(int)
    sp_mx = (
        ('A', 'E'), ('A', 'I'), ('A', 'O'), ('A', 'U'), ('B', 'V'), ('E', 'I'),
        ('E', 'O'), ('E', 'U'), ('I', 'O'), ('I', 'U'), ('O', 'U'), ('I', 'Y'),
        ('E', 'Y'), ('C', 'G'), ('E', 'F'), ('W', 'U'), ('W', 'V'), ('X', 'K'),
        ('S', 'Z'), ('X', 'S'), ('Q', 'C'), ('U', 'V'), ('M', 'N'), ('L', 'I'),
        ('Q', 'O'), ('P', 'R'), ('I', 'J'), ('2', 'Z'), ('5', 'S'), ('8', 'B'),
        ('1', 'I'), ('1', 'L'), ('0', 'O'), ('0', 'Q'), ('C', 'K'), ('G', 'J')
    )

    # Initialize the adjwt array on the first call to the function only.
    # The adjwt array is used to give partial credit for characters that
    # may be errors due to known phonetic or character recognition errors.
    # A typical example is to match the letter "O" with the number "0"
    for i in sp_mx:
        adjwt[(i[0], i[1])] = 3
        adjwt[(i[1], i[0])] = 3

    if len(ying) > len(yang):
        search_range = len(ying)
        minv = len(yang)
    else:
        search_range = len(yang)
        minv = len(ying)

    # Blank out the flags
    ying_flag = [0] * search_range
    yang_flag = [0] * search_range
    search_range = max(0, search_range // 2 - 1)

    # Looking only within the search range, count and flag the matched pairs.
    num_com = 0
    yl1 = len(yang) - 1
    for i in range(len(ying)):
        lowlim = (i - search_range) if (i >= search_range) else 0
        hilim = (i + search_range) if ((i + search_range) <= yl1) else yl1
        for j in range(lowlim, hilim+1):
            if (yang_flag[j] == 0) and (yang[j] == ying[i]):
                yang_flag[j] = 1
                ying_flag[i] = 1
                num_com += 1
                break

    # If no characters in common - return
    if num_com == 0:
        return 0.0

    # Count the number of transpositions
    k = n_trans = 0
    for i in range(len(ying)):
        if ying_flag[i] != 0:
            for j in range(k, len(yang)):
                if yang_flag[j] != 0:
                    k = j + 1
                    break
            if ying[i] != yang[j]:
                n_trans += 1
    n_trans = n_trans // 2

    # Adjust for similarities in unmatched characters
    n_simi = 0
    if minv > num_com:
        for i in range(len(ying)):
            if ying_flag[i] == 0 and _inrange(ying[i]):
                for j in range(len(yang)):
                    if yang_flag[j] == 0 and _inrange(yang[j]):
                        if (ying[i], yang[j]) in adjwt:
                            n_simi += adjwt[(ying[i], yang[j])]
                            yang_flag[j] = 2
                            break
    num_sim = n_simi/10.0 + num_com

    # Main weight computation
    weight = num_sim / len(ying) + num_sim / len(yang) + \
        (num_com - n_trans) / num_com
    weight = weight / 3.0

    # Continue to boost the weight if the strings are similar
    if weight > 0.7:

        # Adjust for having up to the first 4 characters in common
        j = 4 if (minv >= 4) else minv
        i = 0
        while (i < j) and (ying[i] == yang[i]) and (not ying[i].isdigit()):
            i += 1
        if i:
            weight += i * 0.1 * (1.0 - weight)

        # Optionally adjust for long strings.

        # After agreeing beginning chars, at least two more must agree and
        # the agreeing characters must be > .5 of remaining characters.
        if (((long_strings) and (minv > 4) and (num_com > i+1) and
             (2*num_com >= minv+i))):
            if not ying[0].isdigit():
                weight += (1.0-weight) * ((num_com-i-1) /
                                          (len(ying)+len(yang)-i*2+2))

    return weight


def dist_strcmp95(src, tar, long_strings=False):
1355
    """Return the strcmp95 distance between two strings.
1356
1357
    strcmp95 distance is the complement of strcmp95 similarity:
1358
    :math:`dist_{strcmp95} = 1 - sim_{strcmp95}`.
1359
1360
    :param str src, tar: two strings to be compared
1361
    :param bool long_strings: set to True to "Increase the probability of a
1362
        match when the number of matched characters is large.  This option
1363
        allows for a little more tolerance when the strings are large. It is
1364
        not an appropriate test when comparing fixed length fields such as
1365
        phone and social security numbers."
1366
    :returns: strcmp95 distance
1367
    :rtype: float
1368
1369
    >>> round(dist_strcmp95('cat', 'hat'), 12)
1370
    0.222222222222
1371
    >>> round(dist_strcmp95('Niall', 'Neil'), 12)
1372
    0.1545
1373
    >>> round(dist_strcmp95('aluminum', 'Catalan'), 12)
1374
    0.345238095238
1375
    >>> round(dist_strcmp95('ATCG', 'TAGC'), 12)
1376
    0.166666666667
1377
    """
1378
    return 1 - sim_strcmp95(src, tar, long_strings)
1379
1380
1381
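# Usage sketch for the long_strings option (hypothetical inputs; values
# not asserted): the boost is applied only when the base weight already
# exceeds 0.7 and enough characters beyond the common prefix agree, so
# it never affects short or clearly dissimilar pairs, and it can only
# raise the score, never lower it:
#
#     sim_strcmp95('Dwayne Washington', 'Duane Washington')
#     sim_strcmp95('Dwayne Washington', 'Duane Washington',
#                  long_strings=True)  # boosted slightly upward

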
def sim_jaro_winkler(src, tar, qval=1, mode='winkler', long_strings=False,
                     boost_threshold=0.7, scaling_factor=0.1):
    """Return the Jaro or Jaro-Winkler similarity of two strings.

    Jaro(-Winkler) distance is a string edit distance initially proposed by
    Jaro and extended by Winkler :cite:`Jaro:1989,Winkler:1990`.

    This is a Python translation of the C code for strcmp95:
    http://web.archive.org/web/20110629121242/http://www.census.gov/geo/msb/stand/strcmp.c
    :cite:`Winkler:1994`. The above file is a US Government publication and,
    accordingly, in the public domain.

    :param str src, tar: two strings to be compared
    :param int qval: the length of each q-gram (defaults to 1: character-wise
        matching)
    :param str mode: indicates which variant of this distance metric to
        compute:

            - 'winkler' -- computes the Jaro-Winkler distance (default) which
              increases the score for matches near the start of the word
            - 'jaro' -- computes the Jaro distance

    The following arguments apply only when mode is 'winkler':

    :param bool long_strings: set to True to "Increase the probability of a
        match when the number of matched characters is large.  This option
        allows for a little more tolerance when the strings are large.  It is
        not an appropriate test when comparing fixed length fields such as
        phone and social security numbers."
    :param float boost_threshold: a value between 0 and 1, below which the
        Winkler boost is not applied (defaults to 0.7)
    :param float scaling_factor: a value between 0 and 0.25, indicating by how
        much to boost scores for matching prefixes (defaults to 0.1)

    :returns: Jaro or Jaro-Winkler similarity
    :rtype: float

    >>> round(sim_jaro_winkler('cat', 'hat'), 12)
    0.777777777778
    >>> round(sim_jaro_winkler('Niall', 'Neil'), 12)
    0.805
    >>> round(sim_jaro_winkler('aluminum', 'Catalan'), 12)
    0.60119047619
    >>> round(sim_jaro_winkler('ATCG', 'TAGC'), 12)
    0.833333333333

    >>> round(sim_jaro_winkler('cat', 'hat', mode='jaro'), 12)
    0.777777777778
    >>> round(sim_jaro_winkler('Niall', 'Neil', mode='jaro'), 12)
    0.783333333333
    >>> round(sim_jaro_winkler('aluminum', 'Catalan', mode='jaro'), 12)
    0.60119047619
    >>> round(sim_jaro_winkler('ATCG', 'TAGC', mode='jaro'), 12)
    0.833333333333
    """
    if mode == 'winkler':
        if boost_threshold > 1 or boost_threshold < 0:
            raise ValueError('Unsupported boost_threshold assignment; ' +
                             'boost_threshold must be between 0 and 1.')
        if scaling_factor > 0.25 or scaling_factor < 0:
            raise ValueError('Unsupported scaling_factor assignment; ' +
                             'scaling_factor must be between 0 and 0.25.')

    if src == tar:
        return 1.0

    src = QGrams(src.strip(), qval).ordered_list
    tar = QGrams(tar.strip(), qval).ordered_list

    lens = len(src)
    lent = len(tar)

    # If either string is blank - return - added in Version 2
    if lens == 0 or lent == 0:
        return 0.0

    if lens > lent:
        search_range = lens
        minv = lent
    else:
        search_range = lent
        minv = lens

    # Zero out the flags
    src_flag = [0] * search_range
    tar_flag = [0] * search_range
    search_range = max(0, search_range//2 - 1)

    # Looking only within the search range, count and flag the matched pairs.
    num_com = 0
    yl1 = lent - 1
    for i in range(lens):
        lowlim = (i - search_range) if (i >= search_range) else 0
        hilim = (i + search_range) if ((i + search_range) <= yl1) else yl1
        for j in range(lowlim, hilim+1):
            if (tar_flag[j] == 0) and (tar[j] == src[i]):
                tar_flag[j] = 1
                src_flag[i] = 1
                num_com += 1
                break

    # If no characters in common - return
    if num_com == 0:
        return 0.0

    # Count the number of transpositions
    k = n_trans = 0
    for i in range(lens):
        if src_flag[i] != 0:
            for j in range(k, lent):
                if tar_flag[j] != 0:
                    k = j + 1
                    break
            if src[i] != tar[j]:
                n_trans += 1
    n_trans = n_trans // 2

    # Main weight computation for Jaro distance
    weight = num_com / lens + num_com / lent + (num_com - n_trans) / num_com
    weight = weight / 3.0

    # Continue to boost the weight if the strings are similar
    # This is the Winkler portion of Jaro-Winkler distance
    if mode == 'winkler' and weight > boost_threshold:

        # Adjust for having up to the first 4 characters in common
        j = 4 if (minv >= 4) else minv
        i = 0
        while (i < j) and (src[i] == tar[i]):
            i += 1
        if i:
            weight += i * scaling_factor * (1.0 - weight)

        # Optionally adjust for long strings.

        # After agreeing beginning chars, at least two more must agree and
        # the agreeing characters must be > .5 of remaining characters.
        if (((long_strings) and (minv > 4) and (num_com > i+1) and
             (2*num_com >= minv+i))):
            weight += (1.0-weight) * ((num_com-i-1) / (lens+lent-i*2+2))

    return weight


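# Sketch of the boost_threshold knob (calls only; follows directly from
# the `weight > boost_threshold` guard above): raising the threshold
# above the base Jaro score disables the Winkler prefix boost, so
# 'winkler' mode then reproduces 'jaro' mode for that pair:
#
#     sim_jaro_winkler('Niall', 'Neil')                        # boosted
#     sim_jaro_winkler('Niall', 'Neil', boost_threshold=0.99)  # == Jaro
#     sim_jaro_winkler('Niall', 'Neil', mode='jaro')           # == Jaro

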
def dist_jaro_winkler(src, tar, qval=1, mode='winkler', long_strings=False,
                      boost_threshold=0.7, scaling_factor=0.1):
    """Return the Jaro or Jaro-Winkler distance between two strings.

    Jaro(-Winkler) similarity is the complement of Jaro(-Winkler) distance:
    :math:`sim_{Jaro(-Winkler)} = 1 - dist_{Jaro(-Winkler)}`.

    :param str src, tar: two strings to be compared
    :param int qval: the length of each q-gram (defaults to 1: character-wise
        matching)
    :param str mode: indicates which variant of this distance metric to
        compute:

            - 'winkler' -- computes the Jaro-Winkler distance (default) which
              increases the score for matches near the start of the word
            - 'jaro' -- computes the Jaro distance

    The following arguments apply only when mode is 'winkler':

    :param bool long_strings: set to True to "Increase the probability of a
        match when the number of matched characters is large.  This option
        allows for a little more tolerance when the strings are large.  It is
        not an appropriate test when comparing fixed length fields such as
        phone and social security numbers."
    :param float boost_threshold: a value between 0 and 1, below which the
        Winkler boost is not applied (defaults to 0.7)
    :param float scaling_factor: a value between 0 and 0.25, indicating by how
        much to boost scores for matching prefixes (defaults to 0.1)

    :returns: Jaro or Jaro-Winkler distance
    :rtype: float

    >>> round(dist_jaro_winkler('cat', 'hat'), 12)
    0.222222222222
    >>> round(dist_jaro_winkler('Niall', 'Neil'), 12)
    0.195
    >>> round(dist_jaro_winkler('aluminum', 'Catalan'), 12)
    0.39880952381
    >>> round(dist_jaro_winkler('ATCG', 'TAGC'), 12)
    0.166666666667

    >>> round(dist_jaro_winkler('cat', 'hat', mode='jaro'), 12)
    0.222222222222
    >>> round(dist_jaro_winkler('Niall', 'Neil', mode='jaro'), 12)
    0.216666666667
    >>> round(dist_jaro_winkler('aluminum', 'Catalan', mode='jaro'), 12)
    0.39880952381
    >>> round(dist_jaro_winkler('ATCG', 'TAGC', mode='jaro'), 12)
    0.166666666667
    """
    return 1 - sim_jaro_winkler(src, tar, qval, mode, long_strings,
                                boost_threshold, scaling_factor)


def lcsseq(src, tar):
    """Return the longest common subsequence of two strings.

    Longest common subsequence (LCSseq) is the longest subsequence of
    characters that two strings have in common.

    Based on the dynamic programming algorithm from
    http://rosettacode.org/wiki/Longest_common_subsequence#Dynamic_Programming_6
    :cite:`rosettacode:2018b`. This is licensed GFDL 1.2.

    Modifications include:
        conversion to a numpy array in place of a list of lists

    :param str src, tar: two strings to be compared
    :returns: the longest common subsequence
    :rtype: str

    >>> lcsseq('cat', 'hat')
    'at'
    >>> lcsseq('Niall', 'Neil')
    'Nil'
    >>> lcsseq('aluminum', 'Catalan')
    'aln'
    >>> lcsseq('ATCG', 'TAGC')
    'AC'
    """
    lengths = np_zeros((len(src)+1, len(tar)+1), dtype=np_int)

    # row 0 and column 0 are initialized to 0 already
    for i, src_char in enumerate(src):
        for j, tar_char in enumerate(tar):
            if src_char == tar_char:
                lengths[i+1, j+1] = lengths[i, j] + 1
            else:
                lengths[i+1, j+1] = max(lengths[i+1, j], lengths[i, j+1])

    # read the substring out from the matrix
    result = ''
    i, j = len(src), len(tar)
    while i != 0 and j != 0:
        if lengths[i, j] == lengths[i-1, j]:
            i -= 1
        elif lengths[i, j] == lengths[i, j-1]:
            j -= 1
        else:
            result = src[i-1] + result
            i -= 1
            j -= 1
    return result


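# Relation sketch (a standard identity, noted here for reference): the
# indel-only edit distance between two strings can be read off the
# longest common subsequence as len(src) + len(tar) - 2*len(LCSseq):
#
#     len('Niall') + len('Neil') - 2*len(lcsseq('Niall', 'Neil'))  # -> 3
#
# i.e. Niall -> Neil requires two deletions and one insertion.

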
def sim_lcsseq(src, tar):
    r"""Return the longest common subsequence similarity of two strings.

    Longest common subsequence similarity (:math:`sim_{LCSseq}`).

    This employs the LCSseq function to derive a similarity metric:
    :math:`sim_{LCSseq}(s,t) = \\frac{|LCSseq(s,t)|}{max(|s|, |t|)}`

    :param str src, tar: two strings to be compared
    :returns: LCSseq similarity
    :rtype: float

    >>> sim_lcsseq('cat', 'hat')
    0.6666666666666666
    >>> sim_lcsseq('Niall', 'Neil')
    0.6
    >>> sim_lcsseq('aluminum', 'Catalan')
    0.375
    >>> sim_lcsseq('ATCG', 'TAGC')
    0.5
    """
    if src == tar:
        return 1.0
    if not src or not tar:
        return 0.0
    return len(lcsseq(src, tar)) / max(len(src), len(tar))


def dist_lcsseq(src, tar):
    """Return the longest common subsequence distance between two strings.

    Longest common subsequence distance (:math:`dist_{LCSseq}`).

    This employs the LCSseq similarity to derive a distance metric:
    :math:`dist_{LCSseq}(s,t) = 1 - sim_{LCSseq}(s,t)`

    :param str src, tar: two strings to be compared
    :returns: LCSseq distance
    :rtype: float

    >>> dist_lcsseq('cat', 'hat')
    0.33333333333333337
    >>> dist_lcsseq('Niall', 'Neil')
    0.4
    >>> dist_lcsseq('aluminum', 'Catalan')
    0.625
    >>> dist_lcsseq('ATCG', 'TAGC')
    0.5
    """
    return 1 - sim_lcsseq(src, tar)


def lcsstr(src, tar):
    """Return the longest common substring of two strings.

    Longest common substring (LCSstr).

    Based on the code from
    https://en.wikibooks.org/wiki/Algorithm_Implementation/Strings/Longest_common_substring#Python
    :cite:`Wikibooks:2018`.
    This is licensed Creative Commons: Attribution-ShareAlike 3.0.

    Modifications include:

        - conversion to a numpy array in place of a list of lists
        - conversion to Python 2/3-safe range from xrange via six

    :param str src, tar: two strings to be compared
    :returns: the longest common substring
    :rtype: str

    >>> lcsstr('cat', 'hat')
    'at'
    >>> lcsstr('Niall', 'Neil')
    'N'
    >>> lcsstr('aluminum', 'Catalan')
    'al'
    >>> lcsstr('ATCG', 'TAGC')
    'A'
    """
    lengths = np_zeros((len(src)+1, len(tar)+1), dtype=np_int)
    longest, i_longest = 0, 0
    for i in range(1, len(src)+1):
        for j in range(1, len(tar)+1):
            if src[i-1] == tar[j-1]:
                lengths[i, j] = lengths[i-1, j-1] + 1
                if lengths[i, j] > longest:
                    longest = lengths[i, j]
                    i_longest = i
            else:
                lengths[i, j] = 0
    return src[i_longest - longest:i_longest]


def sim_lcsstr(src, tar):
    r"""Return the longest common substring similarity of two strings.

    Longest common substring similarity (:math:`sim_{LCSstr}`).

    This employs the LCSstr function to derive a similarity metric:
    :math:`sim_{LCSstr}(s,t) = \\frac{|LCSstr(s,t)|}{max(|s|, |t|)}`

    :param str src, tar: two strings to be compared
    :returns: LCSstr similarity
    :rtype: float

    >>> sim_lcsstr('cat', 'hat')
    0.6666666666666666
    >>> sim_lcsstr('Niall', 'Neil')
    0.2
    >>> sim_lcsstr('aluminum', 'Catalan')
    0.25
    >>> sim_lcsstr('ATCG', 'TAGC')
    0.25
    """
    if src == tar:
        return 1.0
    if not src or not tar:
        return 0.0
    return len(lcsstr(src, tar)) / max(len(src), len(tar))


def dist_lcsstr(src, tar):
    """Return the longest common substring distance between two strings.

    Longest common substring distance (:math:`dist_{LCSstr}`).

    This employs the LCSstr similarity to derive a distance metric:
    :math:`dist_{LCSstr}(s,t) = 1 - sim_{LCSstr}(s,t)`

    :param str src, tar: two strings to be compared
    :returns: LCSstr distance
    :rtype: float

    >>> dist_lcsstr('cat', 'hat')
    0.33333333333333337
    >>> dist_lcsstr('Niall', 'Neil')
    0.8
    >>> dist_lcsstr('aluminum', 'Catalan')
    0.75
    >>> dist_lcsstr('ATCG', 'TAGC')
    0.75
    """
    return 1 - sim_lcsstr(src, tar)


def sim_ratcliff_obershelp(src, tar):
    """Return the Ratcliff-Obershelp similarity of two strings.

    This follows the Ratcliff-Obershelp algorithm :cite:`Ratcliff:1988` to
    derive a similarity measure:

        1. Find the length of the longest common substring in src & tar.
        2. Recurse on the strings to the left & right of this substring
           in src & tar. The base case is a 0 length common substring, in which
           case, return 0. Otherwise, return the sum of the current longest
           common substring and the left & right recursed sums.
        3. Multiply this length by 2 and divide by the sum of the lengths of
           src & tar.

    Cf.
    http://www.drdobbs.com/database/pattern-matching-the-gestalt-approach/184407970

    :param str src, tar: two strings to be compared
    :returns: Ratcliff-Obershelp similarity
    :rtype: float

    >>> round(sim_ratcliff_obershelp('cat', 'hat'), 12)
    0.666666666667
    >>> round(sim_ratcliff_obershelp('Niall', 'Neil'), 12)
    0.666666666667
    >>> round(sim_ratcliff_obershelp('aluminum', 'Catalan'), 12)
    0.4
    >>> sim_ratcliff_obershelp('ATCG', 'TAGC')
    0.5
    """
    def _lcsstr_stl(src, tar):
        """Return start positions & length for Ratcliff-Obershelp.

        Return the start position in the source string, start position in
        the target string, and length of the longest common substring of
        strings src and tar.
        """
        lengths = np_zeros((len(src)+1, len(tar)+1), dtype=np_int)
        longest, src_longest, tar_longest = 0, 0, 0
        for i in range(1, len(src)+1):
            for j in range(1, len(tar)+1):
                if src[i-1] == tar[j-1]:
                    lengths[i, j] = lengths[i-1, j-1] + 1
                    if lengths[i, j] > longest:
                        longest = lengths[i, j]
                        src_longest = i
                        tar_longest = j
                else:
                    lengths[i, j] = 0
        return (src_longest-longest, tar_longest-longest, longest)

    def _sstr_matches(src, tar):
        """Return the sum of substring match lengths.

        This follows the Ratcliff-Obershelp algorithm :cite:`Ratcliff:1988`:
             1. Find the length of the longest common substring in src & tar.
             2. Recurse on the strings to the left & right of this
                 substring in src & tar.
             3. Base case is a 0 length common substring, in which case,
                 return 0.
             4. Return the sum.
        """
        src_start, tar_start, length = _lcsstr_stl(src, tar)
        if length == 0:
            return 0
        return (_sstr_matches(src[:src_start], tar[:tar_start]) +
                length +
                _sstr_matches(src[src_start+length:], tar[tar_start+length:]))

    if src == tar:
        return 1.0
    if not src or not tar:
        return 0.0
    return 2*_sstr_matches(src, tar)/(len(src)+len(tar))


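# The standard library ships a closely related gestalt matcher (sketch;
# difflib applies "autojunk" heuristics, so its ratio can differ from
# the pure recursion above on some inputs):
#
#     from difflib import SequenceMatcher
#     SequenceMatcher(None, 'cat', 'hat').ratio()  # -> 0.6666666666666666

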
def dist_ratcliff_obershelp(src, tar):
    """Return the Ratcliff-Obershelp distance between two strings.

    Ratcliff-Obershelp distance is the complement of Ratcliff-Obershelp
    similarity:
    :math:`dist_{Ratcliff-Obershelp} = 1 - sim_{Ratcliff-Obershelp}`.

    :param str src, tar: two strings to be compared
    :returns: Ratcliff-Obershelp distance
    :rtype: float

    >>> round(dist_ratcliff_obershelp('cat', 'hat'), 12)
    0.333333333333
    >>> round(dist_ratcliff_obershelp('Niall', 'Neil'), 12)
    0.333333333333
    >>> round(dist_ratcliff_obershelp('aluminum', 'Catalan'), 12)
    0.6
    >>> dist_ratcliff_obershelp('ATCG', 'TAGC')
    0.5
    """
    return 1 - sim_ratcliff_obershelp(src, tar)


def mra_compare(src, tar):
    """Return the MRA comparison rating of two strings.

    The Western Airlines Surname Match Rating Algorithm comparison rating, as
    presented on page 18 of :cite:`Moore:1977`.

    :param str src, tar: two strings to be compared
    :returns: MRA comparison rating
    :rtype: int

    >>> mra_compare('cat', 'hat')
    5
    >>> mra_compare('Niall', 'Neil')
    6
    >>> mra_compare('aluminum', 'Catalan')
    0
    >>> mra_compare('ATCG', 'TAGC')
    5
    """
    if src == tar:
        return 6
    if src == '' or tar == '':
        return 0
    src = list(mra(src))
    tar = list(mra(tar))

    if abs(len(src)-len(tar)) > 2:
        return 0

    length_sum = len(src) + len(tar)
    if length_sum < 5:
        min_rating = 5
    elif length_sum < 8:
        min_rating = 4
    elif length_sum < 12:
        min_rating = 3
    else:
        min_rating = 2

    for _ in range(2):
        new_src = []
        new_tar = []
        minlen = min(len(src), len(tar))
        for i in range(minlen):
            if src[i] != tar[i]:
                new_src.append(src[i])
                new_tar.append(tar[i])
        src = new_src+src[minlen:]
        tar = new_tar+tar[minlen:]
        src.reverse()
        tar.reverse()

    similarity = 6 - max(len(src), len(tar))

    if similarity >= min_rating:
        return similarity
    return 0


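# The minimum-rating thresholds applied above, restated as a table for
# reference (combined length of the two MRA-encoded strings vs. the
# rating required to report a match):
#
#     length sum | minimum rating
#     -----------+---------------
#        <= 4    |       5
#       5 -- 7   |       4
#       8 -- 11  |       3
#       >= 12    |       2

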
def sim_mra(src, tar):
    """Return the normalized MRA similarity of two strings.

    This is the MRA normalized to :math:`[0, 1]`, given that MRA itself is
    constrained to the range :math:`[0, 6]`.

    :param str src, tar: two strings to be compared
    :returns: normalized MRA similarity
    :rtype: float

    >>> sim_mra('cat', 'hat')
    0.8333333333333334
    >>> sim_mra('Niall', 'Neil')
    1.0
    >>> sim_mra('aluminum', 'Catalan')
    0.0
    >>> sim_mra('ATCG', 'TAGC')
    0.8333333333333334
    """
    return mra_compare(src, tar)/6


def dist_mra(src, tar):
    """Return the normalized MRA distance between two strings.

    MRA distance is the complement of MRA similarity:
    :math:`dist_{MRA} = 1 - sim_{MRA}`.

    :param str src, tar: two strings to be compared
    :returns: normalized MRA distance
    :rtype: float

    >>> dist_mra('cat', 'hat')
    0.16666666666666663
    >>> dist_mra('Niall', 'Neil')
    0.0
    >>> dist_mra('aluminum', 'Catalan')
    1.0
    >>> dist_mra('ATCG', 'TAGC')
    0.16666666666666663
    """
    return 1 - sim_mra(src, tar)


def dist_compression(src, tar, compressor='bz2', probs=None):
    """Return the normalized compression distance between two strings.

    Normalized compression distance (NCD) :cite:`Cilibrasi:2005`.

    :param str src, tar: two strings to be compared
    :param str compressor: a compression scheme to use for the similarity
        calculation, from the following:

            - `zlib` -- standard zlib/gzip
            - `bz2` -- bzip2 (default)
            - `lzma` -- Lempel–Ziv–Markov chain algorithm
            - `arith` -- arithmetic coding
            - `rle` -- run-length encoding
            - `bwtrle` -- Burrows-Wheeler transform followed by run-length
              encoding

    :param dict probs: a dictionary trained with ac_train (for the arith
        compressor only)
    :returns: compression distance
    :rtype: float

    >>> dist_compression('cat', 'hat')
    0.08
    >>> dist_compression('Niall', 'Neil')
    0.037037037037037035
    >>> dist_compression('aluminum', 'Catalan')
    0.20689655172413793
    >>> dist_compression('ATCG', 'TAGC')
    0.037037037037037035

    >>> dist_compression('Niall', 'Neil', compressor='zlib')
    0.45454545454545453
    >>> dist_compression('Niall', 'Neil', compressor='bz2')
    0.037037037037037035
    >>> dist_compression('Niall', 'Neil', compressor='lzma')
    0.16
    >>> dist_compression('Niall', 'Neil', compressor='arith')
    0.6875
    >>> dist_compression('Niall', 'Neil', compressor='rle')
    1.0
    >>> dist_compression('Niall', 'Neil', compressor='bwtrle')
    0.8333333333333334
    """
    if src == tar:
        return 0.0

    if compressor not in {'arith', 'rle', 'bwtrle'}:
        src = src.encode('utf-8')
        tar = tar.encode('utf-8')

    if compressor == 'bz2':
        src_comp = encode(src, 'bz2_codec')[15:]
        tar_comp = encode(tar, 'bz2_codec')[15:]
        concat_comp = encode(src+tar, 'bz2_codec')[15:]
        concat_comp2 = encode(tar+src, 'bz2_codec')[15:]
    elif compressor == 'lzma':
        if 'lzma' in modules:
            src_comp = lzma.compress(src)[14:]
            tar_comp = lzma.compress(tar)[14:]
            concat_comp = lzma.compress(src+tar)[14:]
            concat_comp2 = lzma.compress(tar+src)[14:]
        else:
            raise ValueError('Install the PylibLZMA module in order to use ' +
                             'lzma compression similarity')
    elif compressor == 'arith':
        if probs is None:
            # lacking a reasonable dictionary, train on the strings themselves
            probs = ac_train(src+tar)
        src_comp = ac_encode(src, probs)[1]
        tar_comp = ac_encode(tar, probs)[1]
        concat_comp = ac_encode(src+tar, probs)[1]
        concat_comp2 = ac_encode(tar+src, probs)[1]
        return ((min(concat_comp, concat_comp2) - min(src_comp, tar_comp)) /
                max(src_comp, tar_comp))
    elif compressor in {'rle', 'bwtrle'}:
        src_comp = rle_encode(src, (compressor == 'bwtrle'))
        tar_comp = rle_encode(tar, (compressor == 'bwtrle'))
        concat_comp = rle_encode(src+tar, (compressor == 'bwtrle'))
        concat_comp2 = rle_encode(tar+src, (compressor == 'bwtrle'))
    else:  # zlib
        src_comp = encode(src, 'zlib_codec')[2:]
        tar_comp = encode(tar, 'zlib_codec')[2:]
        concat_comp = encode(src+tar, 'zlib_codec')[2:]
        concat_comp2 = encode(tar+src, 'zlib_codec')[2:]
    return ((min(len(concat_comp), len(concat_comp2)) -
             min(len(src_comp), len(tar_comp))) /
            max(len(src_comp), len(tar_comp)))


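# The core NCD computation above, distilled into a standalone sketch
# (hypothetical helper, not part of the public API): the inputs are
# compressed lengths, and dist_compression additionally tries both
# concatenation orders and keeps the minimum.
def _ncd_from_sizes(c_src, c_tar, c_concat):
    """Sketch: NCD from raw compressed sizes per :cite:`Cilibrasi:2005`."""
    return (c_concat - min(c_src, c_tar)) / max(c_src, c_tar)

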
def sim_compression(src, tar, compressor='bz2', probs=None):
    """Return the normalized compression similarity of two strings.

    Normalized compression similarity is the complement of normalized
    compression distance:
    :math:`sim_{NCS} = 1 - dist_{NCD}`.

    :param str src, tar: two strings to be compared
    :param str compressor: a compression scheme to use for the similarity
        calculation:

            - `zlib` -- standard zlib/gzip
            - `bz2` -- bzip2 (default)
            - `lzma` -- Lempel–Ziv–Markov chain algorithm
            - `arith` -- arithmetic coding
            - `rle` -- run-length encoding
            - `bwtrle` -- Burrows-Wheeler transform followed by run-length
              encoding

    :param dict probs: a dictionary trained with ac_train (for the arith
        compressor only)
    :returns: compression similarity
    :rtype: float

    >>> sim_compression('cat', 'hat')
    0.92
    >>> sim_compression('Niall', 'Neil')
    0.962962962962963
    >>> sim_compression('aluminum', 'Catalan')
    0.7931034482758621
    >>> sim_compression('ATCG', 'TAGC')
    0.962962962962963

    >>> sim_compression('Niall', 'Neil', compressor='zlib')
    0.5454545454545454
    >>> sim_compression('Niall', 'Neil', compressor='bz2')
    0.962962962962963
    >>> sim_compression('Niall', 'Neil', compressor='lzma')
    0.84
    >>> sim_compression('Niall', 'Neil', compressor='arith')
    0.3125
    >>> sim_compression('Niall', 'Neil', compressor='rle')
    0.0
    >>> sim_compression('Niall', 'Neil', compressor='bwtrle')
    0.16666666666666663
    """
    return 1 - dist_compression(src, tar, compressor, probs)


def sim_monge_elkan(src, tar, sim_func=sim_levenshtein, symmetric=False):
    """Return the Monge-Elkan similarity of two strings.

    Monge-Elkan is defined in :cite:`Monge:1996`.

    Note: Monge-Elkan is NOT a symmetric similarity algorithm. Thus, the
    similarity of src to tar is not necessarily equal to the similarity of
    tar to src. If the symmetric argument is True, a symmetric value is
    calculated, at the cost of doubling the computation time (since
    :math:`sim_{Monge-Elkan}(src, tar)` and
    :math:`sim_{Monge-Elkan}(tar, src)` are both calculated and then averaged).

    :param str src, tar: two strings to be compared
    :param function sim_func: the internal similarity metric to employ
    :param bool symmetric: return a symmetric similarity measure
    :returns: Monge-Elkan similarity
    :rtype: float

    >>> sim_monge_elkan('cat', 'hat')
    0.75
    >>> round(sim_monge_elkan('Niall', 'Neil'), 12)
    0.666666666667
    >>> round(sim_monge_elkan('aluminum', 'Catalan'), 12)
    0.388888888889
    >>> sim_monge_elkan('ATCG', 'TAGC')
    0.5
    """
    if src == tar:
        return 1.0

    q_src = sorted(QGrams(src).elements())
    q_tar = sorted(QGrams(tar).elements())

    if not q_src or not q_tar:
        return 0.0

    sum_of_maxes = 0
    for q_s in q_src:
        max_sim = float('-inf')
        for q_t in q_tar:
            max_sim = max(max_sim, sim_func(q_s, q_t))
        sum_of_maxes += max_sim
    sim_em = sum_of_maxes / len(q_src)

    if symmetric:
        sim_em = (sim_em + sim_monge_elkan(tar, src, sim_func, False))/2

    return sim_em


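# Asymmetry sketch (calls only; values not asserted): the maxima above
# are taken over the q-grams of src only, so swapping the arguments can
# change the score; symmetric=True averages the two directions:
#
#     sim_monge_elkan('Niall', 'Neil')
#     sim_monge_elkan('Neil', 'Niall')                  # may differ
#     sim_monge_elkan('Niall', 'Neil', symmetric=True)  # mean of both

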
def dist_monge_elkan(src, tar, sim_func=sim_levenshtein, symmetric=False):
    """Return the Monge-Elkan distance between two strings.

    Monge-Elkan distance is the complement of Monge-Elkan similarity:
    :math:`dist_{Monge-Elkan} = 1 - sim_{Monge-Elkan}`.

    :param str src, tar: two strings to be compared
    :param function sim_func: the internal similarity metric to employ
    :param bool symmetric: return a symmetric similarity measure
    :returns: Monge-Elkan distance
    :rtype: float

    >>> dist_monge_elkan('cat', 'hat')
    0.25
    >>> round(dist_monge_elkan('Niall', 'Neil'), 12)
    0.333333333333
    >>> round(dist_monge_elkan('aluminum', 'Catalan'), 12)
    0.611111111111
    >>> dist_monge_elkan('ATCG', 'TAGC')
    0.5
    """
    return 1 - sim_monge_elkan(src, tar, sim_func, symmetric)


def sim_ident(src, tar):
    """Return the identity similarity of two strings.

    Identity similarity is 1 if the two strings are identical, otherwise 0.

    :param str src, tar: two strings to be compared
    :returns: identity similarity
    :rtype: int

    >>> sim_ident('cat', 'hat')
    0
    >>> sim_ident('cat', 'cat')
    1
    """
    return int(src == tar)


def dist_ident(src, tar):
    """Return the identity distance between two strings.

    This is 0 if the two strings are identical, otherwise 1, i.e.
    :math:`dist_{identity} = 1 - sim_{identity}`.

    :param str src, tar: two strings to be compared
    :returns: identity distance
    :rtype: int

    >>> dist_ident('cat', 'hat')
    1
    >>> dist_ident('cat', 'cat')
    0
    """
    return 1 - sim_ident(src, tar)


def sim_matrix(src, tar, mat=None, mismatch_cost=0, match_cost=1,
               symmetric=True, alphabet=None):
    """Return the matrix similarity of two strings.

    With the default parameters, this is identical to sim_ident.
    It is possible for sim_matrix to return values outside of the range
    :math:`[0, 1]`, if values outside that range are present in mat,
    mismatch_cost, or match_cost.

    :param str src, tar: two strings to be compared
    :param dict mat: a dict mapping tuples to costs; the tuples are (src, tar)
        pairs of symbols from the alphabet parameter
    :param float mismatch_cost: the value returned if (src, tar) is absent from
        mat when src does not equal tar
    :param float match_cost: the value returned if (src, tar) is absent from
        mat when src equals tar
    :param bool symmetric: True if the cost of src not matching tar is
        identical to the cost of tar not matching src; in this case, the values
        in mat need only contain (src, tar) or (tar, src), not both
    :param str alphabet: a collection of tokens from which src and tar are
        drawn; if this is defined a ValueError is raised if either tar or src
        is not found in alphabet
    :returns: matrix similarity
    :rtype: float

    >>> sim_matrix('cat', 'hat')
    0
    >>> sim_matrix('hat', 'hat')
    1
    """
    if alphabet:
        alphabet = tuple(alphabet)
        for i in src:
            if i not in alphabet:
                raise ValueError('src value not in alphabet')
        for i in tar:
            if i not in alphabet:
                raise ValueError('tar value not in alphabet')

    if src == tar:
        if mat and (src, src) in mat:
            return mat[(src, src)]
        return match_cost
    if mat and (src, tar) in mat:
        return mat[(src, tar)]
    if symmetric and mat and (tar, src) in mat:
        return mat[(tar, src)]
    return mismatch_cost


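# Sketch of a custom substitution matrix (hypothetical toy example;
# calls shown as comments): a single entry covers both orders when
# symmetric=True, the default:
#
#     _toy_mat = {('A', 'G'): 0.5}
#     sim_matrix('A', 'G', mat=_toy_mat)  # -> 0.5
#     sim_matrix('G', 'A', mat=_toy_mat)  # -> 0.5 (symmetric lookup)
#     sim_matrix('A', 'C', mat=_toy_mat)  # -> 0 (mismatch_cost default)

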
def needleman_wunsch(src, tar, gap_cost=1, sim_func=sim_ident):
    """Return the Needleman-Wunsch score of two strings.

    The Needleman-Wunsch score :cite:`Needleman:1970` is a standard edit
    distance measure.

    :param str src, tar: two strings to be compared
    :param float gap_cost: the cost of an alignment gap (1 by default)
    :param function sim_func: a function that returns the similarity of two
        characters (identity similarity by default)
    :returns: Needleman-Wunsch score
    :rtype: int (in fact dependent on the gap_cost & return value of sim_func)

    >>> needleman_wunsch('cat', 'hat')
    2.0
    >>> needleman_wunsch('Niall', 'Neil')
    1.0
    >>> needleman_wunsch('aluminum', 'Catalan')
    -1.0
    >>> needleman_wunsch('ATCG', 'TAGC')
    0.0
    """
    d_mat = np_zeros((len(src)+1, len(tar)+1), dtype=np_float32)

    for i in range(len(src)+1):
        d_mat[i, 0] = -(i * gap_cost)
    for j in range(len(tar)+1):
        d_mat[0, j] = -(j * gap_cost)
    for i in range(1, len(src)+1):
        for j in range(1, len(tar)+1):
            match = d_mat[i-1, j-1] + sim_func(src[i-1], tar[j-1])
            delete = d_mat[i-1, j] - gap_cost
            insert = d_mat[i, j-1] - gap_cost
            d_mat[i, j] = max(match, delete, insert)
    return d_mat[d_mat.shape[0]-1, d_mat.shape[1]-1]


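# The sim_func hook accepts any pairwise scorer (sketch; call only): a
# +1/-1 scheme in place of the default 0/1 identity similarity makes
# mismatches actively penalized, as in classic DNA alignment scoring:
#
#     needleman_wunsch('ATCG', 'TAGC', gap_cost=1,
#                      sim_func=lambda a, b: 1.0 if a == b else -1.0)

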
def smith_waterman(src, tar, gap_cost=1, sim_func=sim_ident):
    """Return the Smith-Waterman score of two strings.

    The Smith-Waterman score :cite:`Smith:1981` is a standard edit distance
    measure, differing from Needleman-Wunsch in that it focuses on local
    alignment and disallows negative scores.

    :param str src, tar: two strings to be compared
    :param float gap_cost: the cost of an alignment gap (1 by default)
    :param function sim_func: a function that returns the similarity of two
        characters (identity similarity by default)
    :returns: Smith-Waterman score
    :rtype: int (in fact dependent on the gap_cost & return value of sim_func)

    >>> smith_waterman('cat', 'hat')
    2.0
    >>> smith_waterman('Niall', 'Neil')
    1.0
    >>> smith_waterman('aluminum', 'Catalan')
    0.0
    >>> smith_waterman('ATCG', 'TAGC')
    1.0
    """
    d_mat = np_zeros((len(src)+1, len(tar)+1), dtype=np_float32)

    for i in range(len(src)+1):
        d_mat[i, 0] = 0
    for j in range(len(tar)+1):
        d_mat[0, j] = 0
    for i in range(1, len(src)+1):
        for j in range(1, len(tar)+1):
            match = d_mat[i-1, j-1] + sim_func(src[i-1], tar[j-1])
            delete = d_mat[i-1, j] - gap_cost
            insert = d_mat[i, j-1] - gap_cost
            d_mat[i, j] = max(0, match, delete, insert)
    return d_mat[d_mat.shape[0]-1, d_mat.shape[1]-1]


def gotoh(src, tar, gap_open=1, gap_ext=0.4, sim_func=sim_ident):
    """Return the Gotoh score of two strings.

    The Gotoh score :cite:`Gotoh:1982` is essentially Needleman-Wunsch with
    affine gap penalties.

    :param str src, tar: two strings to be compared
    :param float gap_open: the cost of an open alignment gap (1 by default)
    :param float gap_ext: the cost of an alignment gap extension (0.4 by
        default)
    :param function sim_func: a function that returns the similarity of two
        characters (identity similarity by default)
    :returns: Gotoh score
    :rtype: float (in fact dependent on the gap_cost & return value of
        sim_func)

    >>> gotoh('cat', 'hat')
    2.0
    >>> gotoh('Niall', 'Neil')
    1.0
    >>> round(gotoh('aluminum', 'Catalan'), 12)
    -0.4
    >>> gotoh('ATCG', 'TAGC')
    0.0
    """
    d_mat = np_zeros((len(src)+1, len(tar)+1), dtype=np_float32)
    p_mat = np_zeros((len(src)+1, len(tar)+1), dtype=np_float32)
    q_mat = np_zeros((len(src)+1, len(tar)+1), dtype=np_float32)

    d_mat[0, 0] = 0
    p_mat[0, 0] = float('-inf')
    q_mat[0, 0] = float('-inf')
    for i in range(1, len(src)+1):
        d_mat[i, 0] = float('-inf')
        p_mat[i, 0] = -gap_open - gap_ext*(i-1)
        q_mat[i, 0] = float('-inf')
        q_mat[i, 1] = -gap_open
    for j in range(1, len(tar)+1):
        d_mat[0, j] = float('-inf')
        p_mat[0, j] = float('-inf')
        p_mat[1, j] = -gap_open
        q_mat[0, j] = -gap_open - gap_ext*(j-1)

    for i in range(1, len(src)+1):
        for j in range(1, len(tar)+1):
            sim_val = sim_func(src[i-1], tar[j-1])
            d_mat[i, j] = max(d_mat[i-1, j-1] + sim_val,
                              p_mat[i-1, j-1] + sim_val,
                              q_mat[i-1, j-1] + sim_val)

            p_mat[i, j] = max(d_mat[i-1, j] - gap_open,
                              p_mat[i-1, j] - gap_ext)

            q_mat[i, j] = max(d_mat[i, j-1] - gap_open,
                              q_mat[i, j-1] - gap_ext)

    i, j = d_mat.shape[0] - 1, d_mat.shape[1] - 1
    return max(d_mat[i, j], p_mat[i, j], q_mat[i, j])


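# Affine gap sketch: under Gotoh's scheme a single contiguous gap of
# length k costs gap_open + (k - 1)*gap_ext (cf. the p_mat/q_mat
# initialization above), so long gaps are cheaper per position than
# Needleman-Wunsch's linear gap_cost * k (hypothetical helper, for
# illustration only):
def _affine_gap_penalty(k, gap_open=1, gap_ext=0.4):
    """Sketch: total penalty for one contiguous gap of length k >= 1."""
    return gap_open + (k - 1) * gap_ext

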
def sim_length(src, tar):
    """Return the length similarity of two strings.

    Length similarity is the ratio of the length of the shorter string to the
    longer.

    :param str src, tar: two strings to be compared
    :returns: length similarity
    :rtype: float

    >>> sim_length('cat', 'hat')
    1.0
    >>> sim_length('Niall', 'Neil')
    0.8
    >>> sim_length('aluminum', 'Catalan')
    0.875
    >>> sim_length('ATCG', 'TAGC')
    1.0
    """
    if src == tar:
        return 1.0
    if not src or not tar:
        return 0.0
    return len(src)/len(tar) if len(src) < len(tar) else len(tar)/len(src)


def dist_length(src, tar):
    """Return the length distance between two strings.

    Length distance is the complement of length similarity:
    :math:`dist_{length} = 1 - sim_{length}`.

    :param str src, tar: two strings to be compared
    :returns: length distance
    :rtype: float

    >>> dist_length('cat', 'hat')
    0.0
    >>> dist_length('Niall', 'Neil')
    0.19999999999999996
    >>> dist_length('aluminum', 'Catalan')
    0.125
    >>> dist_length('ATCG', 'TAGC')
    0.0
    """
    return 1 - sim_length(src, tar)


def sim_prefix(src, tar):
    """Return the prefix similarity of two strings.

    Prefix similarity is the ratio of the length of the shorter term that
    exactly matches the longer term to the length of the shorter term,
    beginning at the start of both terms.

    :param str src, tar: two strings to be compared
    :returns: prefix similarity
    :rtype: float

    >>> sim_prefix('cat', 'hat')
    0.0
    >>> sim_prefix('Niall', 'Neil')
    0.25
    >>> sim_prefix('aluminum', 'Catalan')
    0.0
    >>> sim_prefix('ATCG', 'TAGC')
    0.0
    """
    if src == tar:
        return 1.0
    if not src or not tar:
        return 0.0
    min_word, max_word = (src, tar) if len(src) < len(tar) else (tar, src)
    min_len = len(min_word)
    for i in range(min_len, 0, -1):
        if min_word[:i] == max_word[:i]:
            return i/min_len
    return 0.0


def dist_prefix(src, tar):
    """Return the prefix distance between two strings.

    Prefix distance is the complement of prefix similarity:
    :math:`dist_{prefix} = 1 - sim_{prefix}`.

    :param str src, tar: two strings to be compared
    :returns: prefix distance
    :rtype: float

    >>> dist_prefix('cat', 'hat')
    1.0
    >>> dist_prefix('Niall', 'Neil')
    0.75
    >>> dist_prefix('aluminum', 'Catalan')
    1.0
    >>> dist_prefix('ATCG', 'TAGC')
    1.0
    """
    return 1 - sim_prefix(src, tar)


def sim_suffix(src, tar):
    """Return the suffix similarity of two strings.

    Suffix similarity is the ratio of the length of the shorter term that
    exactly matches the longer term to the length of the shorter term,
    beginning at the end of both terms.

    :param str src, tar: two strings to be compared
    :returns: suffix similarity
    :rtype: float

    >>> sim_suffix('cat', 'hat')
    0.6666666666666666
    >>> sim_suffix('Niall', 'Neil')
    0.25
    >>> sim_suffix('aluminum', 'Catalan')
    0.0
    >>> sim_suffix('ATCG', 'TAGC')
    0.0
    """
    if src == tar:
        return 1.0
    if not src or not tar:
        return 0.0
    min_word, max_word = (src, tar) if len(src) < len(tar) else (tar, src)
    min_len = len(min_word)
    for i in range(min_len, 0, -1):
        if min_word[-i:] == max_word[-i:]:
            return i/min_len
    return 0.0


def dist_suffix(src, tar):
    """Return the suffix distance between two strings.

    Suffix distance is the complement of suffix similarity:
    :math:`dist_{suffix} = 1 - sim_{suffix}`.

    :param str src, tar: two strings to be compared
    :returns: suffix distance
    :rtype: float

    >>> dist_suffix('cat', 'hat')
    0.33333333333333337
    >>> dist_suffix('Niall', 'Neil')
    0.75
    >>> dist_suffix('aluminum', 'Catalan')
    1.0
    >>> dist_suffix('ATCG', 'TAGC')
    1.0
    """
    return 1 - sim_suffix(src, tar)


def sim_mlipns(src, tar, threshold=0.25, maxmismatches=2):
    """Return the MLIPNS similarity of two strings.

    Modified Language-Independent Product Name Search (MLIPNS) is described in
    :cite:`Shannaq:2010`. This function returns only 1.0 (similar) or 0.0
    (not similar). LIPNS similarity is identical to normalized Hamming
    similarity.

    :param str src, tar: two strings to be compared
    :param float threshold: a number [0, 1] indicating the maximum similarity
        score, below which the strings are considered 'similar' (0.25 by
        default)
    :param int maxmismatches: a number indicating the allowable number of
        mismatches to remove before declaring two strings not similar (2 by
        default)
    :returns: MLIPNS similarity
    :rtype: float

    >>> sim_mlipns('cat', 'hat')
    1.0
    >>> sim_mlipns('Niall', 'Neil')
    0.0
    >>> sim_mlipns('aluminum', 'Catalan')
    0.0
    >>> sim_mlipns('ATCG', 'TAGC')
    0.0
    """
    if tar == src:
        return 1.0
    if not src or not tar:
        return 0.0

    mismatches = 0
    ham = hamming(src, tar, difflens=True)
    maxlen = max(len(src), len(tar))
    while src and tar and mismatches <= maxmismatches:
        if maxlen < 1 or (1-(maxlen-ham)/maxlen) <= threshold:
            return 1.0
        mismatches += 1
        ham -= 1
        maxlen -= 1

    if maxlen < 1:
        return 1.0
    return 0.0


def dist_mlipns(src, tar, threshold=0.25, maxmismatches=2):
    """Return the MLIPNS distance between two strings.

    MLIPNS distance is the complement of MLIPNS similarity:
    :math:`dist_{MLIPNS} = 1 - sim_{MLIPNS}`. This function returns only 0.0
    (distant) or 1.0 (not distant).

    :param str src, tar: two strings to be compared
    :param float threshold: a number [0, 1] indicating the maximum similarity
        score, below which the strings are considered 'similar' (0.25 by
        default)
    :param int maxmismatches: a number indicating the allowable number of
        mismatches to remove before declaring two strings not similar (2 by
        default)
    :returns: MLIPNS distance
    :rtype: float

    >>> dist_mlipns('cat', 'hat')
    0.0
    >>> dist_mlipns('Niall', 'Neil')
    1.0
    >>> dist_mlipns('aluminum', 'Catalan')
    1.0
    >>> dist_mlipns('ATCG', 'TAGC')
    1.0
    """
    return 1.0 - sim_mlipns(src, tar, threshold, maxmismatches)


def bag(src, tar):
    """Return the bag distance between two strings.

    Bag distance is proposed in :cite:`Bartolini:2002`. It is defined as:
    :math:`max(|multiset(src)-multiset(tar)|, |multiset(tar)-multiset(src)|)`.

    :param str src, tar: two strings to be compared
    :returns: bag distance
    :rtype: int

    >>> bag('cat', 'hat')
    1
    >>> bag('Niall', 'Neil')
    2
    >>> bag('aluminum', 'Catalan')
    5
    >>> bag('ATCG', 'TAGC')
    0
    >>> bag('abcdefg', 'hijklm')
    7
    >>> bag('abcdefg', 'hijklmno')
    8
    """
    if tar == src:
        return 0
    if not src:
        return len(tar)
    if not tar:
        return len(src)

    src_bag = Counter(src)
    tar_bag = Counter(tar)
    return max(sum((src_bag-tar_bag).values()),
               sum((tar_bag-src_bag).values()))


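# The multiset arithmetic behind bag() (sketch): Counter subtraction
# keeps only positive counts, which is why both directions must be
# checked and the maximum taken:
#
#     sum((Counter('Niall') - Counter('Neil')).values())  # -> 2 (a, l)
#     sum((Counter('Neil') - Counter('Niall')).values())  # -> 1 (e)
#
# max(2, 1) == 2 == bag('Niall', 'Neil')

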
def dist_bag(src, tar):
    """Return the normalized bag distance between two strings.

    Bag distance is normalized by dividing by :math:`max( |src|, |tar| )`.

    :param str src, tar: two strings to be compared
    :returns: normalized bag distance
    :rtype: float

    >>> dist_bag('cat', 'hat')
    0.3333333333333333
    >>> dist_bag('Niall', 'Neil')
    0.4
    >>> dist_bag('aluminum', 'Catalan')
    0.625
    >>> dist_bag('ATCG', 'TAGC')
    0.0
    """
    if tar == src:
        return 0.0
    if not src or not tar:
        return 1.0

    maxlen = max(len(src), len(tar))

    return bag(src, tar)/maxlen


def sim_bag(src, tar):
    """Return the normalized bag similarity of two strings.

    Normalized bag similarity is the complement of normalized bag distance:
    :math:`sim_{bag} = 1 - dist_{bag}`.

    :param str src, tar: two strings to be compared
    :returns: normalized bag similarity
    :rtype: float

    >>> round(sim_bag('cat', 'hat'), 12)
    0.666666666667
    >>> sim_bag('Niall', 'Neil')
    0.6
    >>> sim_bag('aluminum', 'Catalan')
    0.375
    >>> sim_bag('ATCG', 'TAGC')
    1.0
    """
    return 1 - dist_bag(src, tar)


def editex(src, tar, cost=(0, 1, 2), local=False):
    """Return the Editex distance between two strings.

    As described on pages 3 & 4 of :cite:`Zobel:1996`.

    The local variant is based on :cite:`Ring:2009`.

    :param str src, tar: two strings to be compared
    :param tuple cost: a 3-tuple representing the cost of the three possible
        edit operations:
        match, same-group, and mismatch respectively (by default: (0, 1, 2))
    :param bool local: if True, the local variant of Editex is used
    :returns: Editex distance
    :rtype: int

    >>> editex('cat', 'hat')
    2
    >>> editex('Niall', 'Neil')
    2
    >>> editex('aluminum', 'Catalan')
    12
    >>> editex('ATCG', 'TAGC')
    6
    """
    match_cost, group_cost, mismatch_cost = cost
    letter_groups = ({'A', 'E', 'I', 'O', 'U', 'Y'},
                     {'B', 'P'},
                     {'C', 'K', 'Q'},
                     {'D', 'T'},
                     {'L', 'R'},
                     {'M', 'N'},
                     {'G', 'J'},
                     {'F', 'P', 'V'},
                     {'S', 'X', 'Z'},
                     {'C', 'S', 'Z'})
    all_letters = {'A', 'B', 'C', 'D', 'E', 'F', 'G', 'I', 'J', 'K', 'L', 'M',
                   'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'X', 'Y', 'Z'}

    def r_cost(ch1, ch2):
        """Return r(a,b) according to Zobel & Dart's definition."""
        if ch1 == ch2:
            return match_cost
        if ch1 in all_letters and ch2 in all_letters:
            for group in letter_groups:
                if ch1 in group and ch2 in group:
                    return group_cost
        return mismatch_cost

    def d_cost(ch1, ch2):
        """Return d(a,b) according to Zobel & Dart's definition."""
        if ch1 != ch2 and ch1 in {'H', 'W'}:
            return group_cost
        return r_cost(ch1, ch2)

    # convert both src & tar to NFKD normalized unicode
    src = normalize('NFKD', text_type(src.upper()))
    tar = normalize('NFKD', text_type(tar.upper()))
    # convert ß to SS (for Python2)
    src = src.replace('ß', 'SS')
    tar = tar.replace('ß', 'SS')

    if src == tar:
        return 0
    if not src:
        return len(tar) * mismatch_cost
    if not tar:
        return len(src) * mismatch_cost

    d_mat = np_zeros((len(src)+1, len(tar)+1), dtype=np_int)
    lens = len(src)
    lent = len(tar)
    src = ' '+src
    tar = ' '+tar

    if not local:
        for i in range(1, lens+1):
            d_mat[i, 0] = d_mat[i-1, 0] + d_cost(src[i-1], src[i])
    for j in range(1, lent+1):
        d_mat[0, j] = d_mat[0, j-1] + d_cost(tar[j-1], tar[j])

    for i in range(1, lens+1):
        for j in range(1, lent+1):
            d_mat[i, j] = min(d_mat[i-1, j] + d_cost(src[i-1], src[i]),
                              d_mat[i, j-1] + d_cost(tar[j-1], tar[j]),
                              d_mat[i-1, j-1] + r_cost(src[i], tar[j]))

    return d_mat[lens, lent]
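

# A worked sketch of the Editex cost functions above, hand-derived from the
# default cost tuple (0, 1, 2): 'C' and 'K' fall in the same letter group
# {'C', 'K', 'Q'}, so substituting one for the other costs group_cost (1),
# whereas 'C' and 'H' share no group ('H' and 'W' are deliberately absent
# from all_letters), so that substitution costs mismatch_cost (2).  Hence:
#
#     editex('cat', 'kat')  # -> 1 (same-group substitution)
#     editex('cat', 'hat')  # -> 2 (mismatch, as in the doctests above)

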
def dist_editex(src, tar, cost=(0, 1, 2), local=False):
    """Return the normalized Editex distance between two strings.

    The Editex distance is normalized by dividing the Editex distance
    (in either its standard or local variant) by the greater of
    the number of characters in src and the number of characters in tar,
    each multiplied by the mismatch cost.
    For the case in which all operations have :math:`cost = 1`, this is
    equivalent to the greater of the length of the two strings src & tar.

    :param str src, tar: two strings to be compared
    :param tuple cost: a 3-tuple representing the cost of the three possible
        edit operations:
        match, same-group, and mismatch respectively (by default: (0, 1, 2))
    :param bool local: if True, the local variant of Editex is used
    :returns: normalized Editex distance
    :rtype: float

    >>> round(dist_editex('cat', 'hat'), 12)
    0.333333333333
    >>> round(dist_editex('Niall', 'Neil'), 12)
    0.2
    >>> dist_editex('aluminum', 'Catalan')
    0.75
    >>> dist_editex('ATCG', 'TAGC')
    0.75
    """
    if src == tar:
        return 0
    mismatch_cost = cost[2]
    return (editex(src, tar, cost, local) /
            (max(len(src)*mismatch_cost, len(tar)*mismatch_cost)))
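

# A hand-derived normalization check (illustrative, built on the doctests
# above): editex('cat', 'hat') is 2 and the default mismatch cost is 2, so
# the divisor is max(3, 3) * 2 == 6, giving
#
#     dist_editex('cat', 'hat')  # -> 2/6 == 0.333...

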
def sim_editex(src, tar, cost=(0, 1, 2), local=False):
    """Return the normalized Editex similarity of two strings.

    The Editex similarity is the complement of Editex distance:
    :math:`sim_{Editex} = 1 - dist_{Editex}`.

    :param str src, tar: two strings to be compared
    :param tuple cost: a 3-tuple representing the cost of the three possible
        edit operations:
        match, same-group, and mismatch respectively (by default: (0, 1, 2))
    :param bool local: if True, the local variant of Editex is used
    :returns: normalized Editex similarity
    :rtype: float

    >>> round(sim_editex('cat', 'hat'), 12)
    0.666666666667
    >>> round(sim_editex('Niall', 'Neil'), 12)
    0.8
    >>> sim_editex('aluminum', 'Catalan')
    0.25
    >>> sim_editex('ATCG', 'TAGC')
    0.25
    """
    return 1 - dist_editex(src, tar, cost, local)


def eudex_hamming(src, tar, weights='exponential', maxlength=8,
                  normalized=False):
    """Calculate the Hamming distance between the Eudex hashes of two terms.

    Cf. :cite:`Ticki:2016`.

    - If weights is set to None, a simple Hamming distance is calculated.
    - If weights is set to 'exponential', weight decays by powers of 2, as
      proposed in the eudex specification: https://github.com/ticki/eudex.
    - If weights is set to 'fibonacci', weight decays through the Fibonacci
      series, as in the eudex reference implementation.
    - If weights is set to a callable function, it is assumed to be a
      generator function, and the generator it returns is used to populate
      a series of weights.
    - If weights is set to an iterable, the iterable's values should be
      integers and will be used as the weights.

    :param str src, tar: two strings to be compared
    :param iterable or generator function weights: the weights to apply to
        each byte of the XORed hashes (see the options above)
    :param maxlength: the number of characters to encode as a eudex hash
    :param bool normalized: normalizes to [0, 1] if True
    :returns: the weighted Hamming distance between the Eudex hashes
    :rtype: int or float
    """
    def _gen_fibonacci():
        """Yield the next Fibonacci number.

        Based on https://www.python-course.eu/generators.php
        Starts at Fibonacci number 3 (the second 1)
        """
        num_a, num_b = 1, 2
        while True:
            yield num_a
            num_a, num_b = num_b, num_a + num_b

    def _gen_exponential(base=2):
        """Yield the next value in an exponential series of the base.

        Starts at base**0
        """
        exp = 0
        while True:
            yield base ** exp
            exp += 1

    # Calculate the eudex hashes and XOR them
    xored = eudex(src, maxlength=maxlength) ^ eudex(tar, maxlength=maxlength)

    # Simple hamming distance (all bits are equal)
    if not weights:
        binary = bin(xored)
        distance = binary.count('1')
        if normalized:
            return distance/(len(binary)-2)
        return distance

    # If weights is a function, it should create a generator,
    # which we now use to populate a list
    if callable(weights):
        weights = weights()
    elif weights == 'exponential':
        weights = _gen_exponential()
    elif weights == 'fibonacci':
        weights = _gen_fibonacci()
    if isinstance(weights, GeneratorType):
        weights = [next(weights) for _ in range(maxlength)][::-1]

    # Sum the weighted hamming distance
    distance = 0
    maxdist = 0
    while (xored or normalized) and weights:
        maxdist += 8*weights[-1]
        distance += bin(xored & 0xFF).count('1') * weights.pop()
        xored >>= 8

    if normalized:
        distance /= maxdist

    return distance
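

# A usage sketch for custom weights (hypothetical values; actual return
# values depend on the eudex() hashes): an explicit weight list is consumed
# from its right end, so its last element is applied to the hash's least
# significant byte (roughly, the end of the word):
#
#     eudex_hamming('Colin', 'Cuilen', weights=[8, 7, 6, 5, 4, 3, 2, 1])
#
# By comparison, weights='fibonacci' builds the list
# [34, 21, 13, 8, 5, 3, 2, 1] for the default maxlength=8.

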
def dist_eudex(src, tar, weights='exponential', maxlength=8):
    """Return normalized Hamming distance between Eudex hashes of two terms.

    This is Eudex distance normalized to [0, 1].

    :param str src, tar: two strings to be compared
    :param iterable or generator function weights: the weights to apply to
        each byte of the XORed hashes (see eudex_hamming)
    :param maxlength: the number of characters to encode as a eudex hash
    :returns: normalized Eudex distance
    :rtype: float
    """
    return eudex_hamming(src, tar, weights, maxlength, True)


def sim_eudex(src, tar, weights='exponential', maxlength=8):
    """Return normalized Hamming similarity between Eudex hashes of two terms.

    Normalized Eudex similarity is the complement of normalized Eudex
    distance: :math:`sim_{Eudex} = 1 - dist_{Eudex}`.

    :param str src, tar: two strings to be compared
    :param iterable or generator function weights: the weights to apply to
        each byte of the XORed hashes (see eudex_hamming)
    :param maxlength: the number of characters to encode as a eudex hash
    :returns: normalized Eudex similarity
    :rtype: float
    """
    return 1-dist_eudex(src, tar, weights, maxlength)


def sift4_simplest(src, tar, max_offset=5):
    """Return the "simplest" Sift4 distance between two terms.

    This is an approximation of edit distance, described in
    :cite:`Zackwehdex:2014`.

    :param str src, tar: two strings to be compared
    :param max_offset: the number of characters to search for matching letters
    :returns: the Sift4 distance according to the simplest formula
    """
    if not src:
        return len(tar)

    if not tar:
        return len(src)

    src_len = len(src)
    tar_len = len(tar)

    src_cur = 0
    tar_cur = 0
    lcss = 0
    local_cs = 0

    while (src_cur < src_len) and (tar_cur < tar_len):
        if src[src_cur] == tar[tar_cur]:
            local_cs += 1
        else:
            lcss += local_cs
            local_cs = 0
            if src_cur != tar_cur:
                src_cur = tar_cur = max(src_cur, tar_cur)
            for i in range(max_offset):
                if not ((src_cur+i < src_len) or (tar_cur+i < tar_len)):
                    break
                if (src_cur+i < src_len) and (src[src_cur+i] == tar[tar_cur]):
                    src_cur += i
                    local_cs += 1
                    break
                if (tar_cur+i < tar_len) and (src[src_cur] == tar[tar_cur+i]):
                    tar_cur += i
                    local_cs += 1
                    break

        src_cur += 1
        tar_cur += 1

    lcss += local_cs
    return round(max(src_len, tar_len) - lcss)
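

# A hand-traced sketch of the loop above (not a doctest; derived by hand):
# for src='cat', tar='hat', the initial 'c'/'h' mismatch finds no match
# within max_offset, but 'a' and 't' then match in place, so lcss == 2 and
#
#     sift4_simplest('cat', 'hat')  # -> max(3, 3) - 2 == 1

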
def sift4_common(src, tar, max_offset=5, max_distance=0):
    """Return the "common" Sift4 distance between two terms.

    This is an approximation of edit distance, described in
    :cite:`Zackwehdex:2014`.

    :param str src, tar: two strings to be compared
    :param max_offset: the number of characters to search for matching letters
    :param max_distance: the distance at which to stop and exit
    :returns: the Sift4 distance according to the common formula
    """
    if not src:
        return len(tar)

    if not tar:
        return len(src)

    src_len = len(src)
    tar_len = len(tar)

    src_cur = 0
    tar_cur = 0
    lcss = 0
    local_cs = 0
    trans = 0
    offset_arr = []

    while (src_cur < src_len) and (tar_cur < tar_len):
        if src[src_cur] == tar[tar_cur]:
            local_cs += 1
            is_trans = False
            i = 0
            while i < len(offset_arr):
                ofs = offset_arr[i]
                if src_cur <= ofs['src_cur'] or tar_cur <= ofs['tar_cur']:
                    is_trans = (abs(tar_cur-src_cur) >=
                                abs(ofs['tar_cur']-ofs['src_cur']))
                    if is_trans:
                        trans += 1
                    elif not ofs['trans']:
                        ofs['trans'] = True
                        trans += 1
                    break
                elif src_cur > ofs['tar_cur'] and tar_cur > ofs['src_cur']:
                    del offset_arr[i]
                else:
                    i += 1

            offset_arr.append({'src_cur': src_cur, 'tar_cur': tar_cur,
                               'trans': is_trans})
        else:
            lcss += local_cs
            local_cs = 0
            if src_cur != tar_cur:
                src_cur = tar_cur = min(src_cur, tar_cur)
            for i in range(max_offset):
                if not ((src_cur+i < src_len) or (tar_cur+i < tar_len)):
                    break
                if (src_cur+i < src_len) and (src[src_cur+i] == tar[tar_cur]):
                    src_cur += i-1
                    tar_cur -= 1
                    break
                if (tar_cur+i < tar_len) and (src[src_cur] == tar[tar_cur+i]):
                    src_cur -= 1
                    tar_cur += i-1
                    break

        src_cur += 1
        tar_cur += 1

        if max_distance:
            temporary_distance = max(src_cur, tar_cur) - lcss + trans
            if temporary_distance >= max_distance:
                return round(temporary_distance)

        if (src_cur >= src_len) or (tar_cur >= tar_len):
            lcss += local_cs
            local_cs = 0
            src_cur = tar_cur = min(src_cur, tar_cur)

    lcss += local_cs
    return round(max(src_len, tar_len) - lcss + trans)
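

# A usage sketch of the early-exit parameter (illustrative; the string names
# are placeholders): with a nonzero max_distance, the loop above returns as
# soon as the running estimate max(src_cur, tar_cur) - lcss + trans reaches
# that bound, so
#
#     sift4_common(some_long_str, other_long_str, max_distance=3)
#
# stops comparing once the strings are at least that far apart, which is
# useful for cheap "close enough?" filtering before an exact measure.

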
def dist_sift4(src, tar, max_offset=5, max_distance=0):
    """Return the normalized "common" Sift4 distance between two terms.

    This is Sift4 distance, normalized to [0, 1].

    :param str src, tar: two strings to be compared
    :param max_offset: the number of characters to search for matching letters
    :param max_distance: the distance at which to stop and exit
    :returns: normalized Sift4 distance
    :rtype: float
    """
    return (sift4_common(src, tar, max_offset, max_distance) /
            (max(len(src), len(tar), 1)))
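

# A hand-derived normalization check (not a doctest): sift4_common('cat',
# 'hat') is 1 and the longer string has 3 characters, so
#
#     dist_sift4('cat', 'hat')  # -> 1/3
#
# The extra 1 in max(len(src), len(tar), 1) guards against division by zero
# when both strings are empty.

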
def sim_sift4(src, tar, max_offset=5, max_distance=0):
    """Return the normalized "common" Sift4 similarity of two terms.

    Normalized Sift4 similarity is the complement of normalized Sift4
    distance: :math:`sim_{Sift4} = 1 - dist_{Sift4}`.

    :param str src, tar: two strings to be compared
    :param max_offset: the number of characters to search for matching letters
    :param max_distance: the distance at which to stop and exit
    :returns: normalized Sift4 similarity
    :rtype: float
    """
    return 1-dist_sift4(src, tar, max_offset, max_distance)


def sim_baystat(src, tar, min_ss_len=None, left_ext=None, right_ext=None):
    """Return the Baystat similarity.

    Good results for shorter words are reported when setting min_ss_len to 1
    and either left_ext OR right_ext to 1.

    The Baystat similarity is defined in :cite:`Furnohr:2002`.

    This is ostensibly a port of the R module PPRL's implementation:
    https://github.com/cran/PPRL/blob/master/src/MTB_Baystat.cpp
    :cite:`Rukasz:2018`. As such, this could be made more pythonic.

    :param str src, tar: two strings to be compared
    :param int min_ss_len: minimum substring length to be considered
    :param int left_ext: left-side extension length
    :param int right_ext: right-side extension length
    :rtype: float
    :return: the Baystat similarity
    """
    if src == tar:
        return 1
    if not src or not tar:
        return 0

    max_len = max(len(src), len(tar))

    if not (min_ss_len and left_ext and right_ext):
        # These can be set via arguments to the function. Otherwise they are
        # set automatically based on values from the article.
        if max_len >= 7:
            min_ss_len = 2
            left_ext = 2
            right_ext = 2
        else:
            # The paper suggests that for short names, (exclusively) one or
            # the other of left_ext and right_ext can be 1, with good
            # results. I use 0 & 0 as the default in this case.
            min_ss_len = 1
            left_ext = 0
            right_ext = 0

    pos = 0
    match_len = 0

    while True:
        if pos + min_ss_len > len(src):
            return match_len/max_len

        hit_len = 0
        idx = 1

        substring = src[pos:pos + min_ss_len]
        search_begin = pos - left_ext

        if search_begin < 0:
            search_begin = 0
            left_ext_len = pos
        else:
            left_ext_len = left_ext

        if pos + min_ss_len + right_ext >= len(tar):
            right_ext_len = len(tar) - pos - min_ss_len
        else:
            right_ext_len = right_ext

        if (search_begin + left_ext_len + min_ss_len + right_ext_len >
                search_begin):
            search_val = tar[search_begin:(search_begin + left_ext_len +
                                           min_ss_len + right_ext_len)]
        else:
            search_val = ''

        flagged_tar = ''
        while substring in search_val and pos + idx <= len(src):
            hit_len = len(substring)
            flagged_tar = tar.replace(substring, '#'*hit_len)

            if pos + min_ss_len + idx <= len(src):
                substring = src[pos:pos + min_ss_len + idx]

            if pos+min_ss_len + right_ext_len + 1 <= len(tar):
                right_ext_len += 1

            if (search_begin + left_ext_len + min_ss_len + right_ext_len <=
                    len(tar)):
                search_val = tar[search_begin:(search_begin + left_ext_len +
                                               min_ss_len + right_ext_len)]

            idx += 1

        if hit_len > 0:
            tar = flagged_tar

        match_len += hit_len
        pos += idx
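

# A sketch of the automatic parameter selection above (hand-derived from the
# guard): comparing two names of fewer than 7 characters defaults to
# min_ss_len=1, left_ext=0, right_ext=0, while names of 7+ characters use 2
# for all three.  Note that the `if not (...)` guard re-derives the defaults
# whenever any of the three arguments is falsy, so to have custom values
# respected all three must be nonzero, e.g.:
#
#     sim_baystat('Niall', 'Neil', min_ss_len=1, left_ext=1, right_ext=1)

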
def dist_baystat(src, tar, min_ss_len=None, left_ext=None, right_ext=None):
    """Return the Baystat distance.

    Normalized Baystat distance is the complement of normalized Baystat
    similarity: :math:`dist_{Baystat} = 1 - sim_{Baystat}`.

    :param str src, tar: two strings to be compared
    :param int min_ss_len: minimum substring length to be considered
    :param int left_ext: left-side extension length
    :param int right_ext: right-side extension length
    :rtype: float
    :return: the Baystat distance
    """
    return 1-sim_baystat(src, tar, min_ss_len, left_ext, right_ext)


def typo(src, tar, metric='euclidean', cost=(1, 1, 0.5, 0.5), layout='QWERTY'):
    """Return the typo distance between two strings.

    This is inspired by Typo-Distance :cite:`Song:2011`, and a fair bit of
    this was copied from that module. Compared to the original, this supports
    different metrics for substitution.

    :param str src, tar: two strings to be compared
    :param str metric: supported values include: 'euclidean', 'manhattan',
          'log-euclidean', and 'log-manhattan'
    :param tuple cost: a 4-tuple representing the cost of the four possible
        edits: inserts, deletes, substitutions, and shift, respectively (by
        default: (1, 1, 0.5, 0.5)) The substitution & shift costs should be
        significantly less than the cost of an insertion & deletion unless
        a log metric is used.
    :param str layout: name of the keyboard layout to use (supported:
        'QWERTY', 'Dvorak', 'AZERTY', 'QWERTZ')
    :return: typo distance
    :rtype: float
    """
    ins_cost, del_cost, sub_cost, shift_cost = cost

    if src == tar:
        return 0.0
    if not src:
        return len(tar) * ins_cost
    if not tar:
        return len(src) * del_cost

    kbs = {'QWERTY': (
        (('`', '1', '2', '3', '4', '5', '6', '7', '8', '9', '0', '-', '='),
         ('', 'q', 'w', 'e', 'r', 't', 'y', 'u', 'i', 'o', 'p', '[', ']',
          '\\'),
         ('', 'a', 's', 'd', 'f', 'g', 'h', 'j', 'k', 'l', ';', '\''),
         ('', 'z', 'x', 'c', 'v', 'b', 'n', 'm', ',', '.', '/')),
        (('~', '!', '@', '#', '$', '%', '^', '&', '*', '(', ')', '_', '+'),
         ('', 'Q', 'W', 'E', 'R', 'T', 'Y', 'U', 'I', 'O', 'P', '{', '}', '|'),
         ('', 'A', 'S', 'D', 'F', 'G', 'H', 'J', 'K', 'L', ':', '"'),
         ('', 'Z', 'X', 'C', 'V', 'B', 'N', 'M', '<', '>', '?'))
    ), 'Dvorak': (
        (('`', '1', '2', '3', '4', '5', '6', '7', '8', '9', '0', '[', ']'),
         ('', '\'', ',', '.', 'p', 'y', 'f', 'g', 'c', 'r', 'l', '/', '=',
          '\\'),
         ('', 'a', 'o', 'e', 'u', 'i', 'd', 'h', 't', 'n', 's', '-'),
         ('', ';', 'q', 'j', 'k', 'x', 'b', 'm', 'w', 'v', 'z')),
        (('~', '!', '@', '#', '$', '%', '^', '&', '*', '(', ')', '{', '}'),
         ('', '"', '<', '>', 'P', 'Y', 'F', 'G', 'C', 'R', 'L', '?', '+', '|'),
         ('', 'A', 'O', 'E', 'U', 'I', 'D', 'H', 'T', 'N', 'S', '_'),
         ('', ':', 'Q', 'J', 'K', 'X', 'B', 'M', 'W', 'V', 'Z'))
    ), 'AZERTY': (
        (('²', '&', 'é', '"', '\'', '(', '-', 'è', '_', 'ç', 'à', ')', '='),
         ('', 'a', 'z', 'e', 'r', 't', 'y', 'u', 'i', 'o', 'p', '', '$'),
         ('', 'q', 's', 'd', 'f', 'g', 'h', 'j', 'k', 'l', 'm', 'ù', '*'),
         ('<', 'w', 'x', 'c', 'v', 'b', 'n', ',', ';', ':', '!')),
        (('~', '1', '2', '3', '4', '5', '6', '7', '8', '9', '0', '°', '+'),
         ('', 'A', 'Z', 'E', 'R', 'T', 'Y', 'U', 'I', 'O', 'P', '', '£'),
         ('', 'Q', 'S', 'D', 'F', 'G', 'H', 'J', 'K', 'L', 'M', 'Ù', 'μ'),
         ('>', 'W', 'X', 'C', 'V', 'B', 'N', '?', '.', '/', '§'))
    ), 'QWERTZ': (
        (('', '1', '2', '3', '4', '5', '6', '7', '8', '9', '0', 'ß', ''),
         ('', 'q', 'w', 'e', 'r', 't', 'z', 'u', 'i', 'o', 'p', 'ü', '+',
          '\\'),
         ('', 'a', 's', 'd', 'f', 'g', 'h', 'j', 'k', 'l', 'ö', 'ä', '#'),
         ('<', 'y', 'x', 'c', 'v', 'b', 'n', 'm', ',', '.', '-')),
        (('°', '!', '"', '§', '$', '%', '&', '/', '(', ')', '=', '?', ''),
         ('', 'Q', 'W', 'E', 'R', 'T', 'Z', 'U', 'I', 'O', 'P', 'Ü', '*', ''),
         ('', 'A', 'S', 'D', 'F', 'G', 'H', 'J', 'K', 'L', 'Ö', 'Ä', '\''),
         ('>', 'Y', 'X', 'C', 'V', 'B', 'N', 'M', ';', ':', '_'))
    )}

    keyboard = kbs[layout]
    lowercase = {item for sublist in keyboard[0] for item in sublist}
    uppercase = {item for sublist in keyboard[1] for item in sublist}

    def _kb_array_for_char(char):
        """Return the keyboard layout sub-array that contains char."""
        if char in lowercase:
            return keyboard[0]
        if char in uppercase:
            return keyboard[1]
        raise ValueError(char + ' not found in any keyboard layouts')

    def _get_char_coord(char, keyboard):
        """Return the row & column of char in the keyboard."""
        for row in keyboard:
            if char in row:
                return keyboard.index(row), row.index(char)
        raise ValueError(char + ' not found in given keyboard layout')

    def _euclidean_keyboard_distance(char1, char2):
        row1, col1 = _get_char_coord(char1, _kb_array_for_char(char1))
        row2, col2 = _get_char_coord(char2, _kb_array_for_char(char2))
        return ((row1 - row2) ** 2 + (col1 - col2) ** 2) ** 0.5

    def _manhattan_keyboard_distance(char1, char2):
        row1, col1 = _get_char_coord(char1, _kb_array_for_char(char1))
        row2, col2 = _get_char_coord(char2, _kb_array_for_char(char2))
        return abs(row1 - row2) + abs(col1 - col2)

    def _log_euclidean_keyboard_distance(char1, char2):
        return log(1 + _euclidean_keyboard_distance(char1, char2))

    def _log_manhattan_keyboard_distance(char1, char2):
        return log(1 + _manhattan_keyboard_distance(char1, char2))

    metric_dict = {'euclidean': _euclidean_keyboard_distance,
                   'manhattan': _manhattan_keyboard_distance,
                   'log-euclidean': _log_euclidean_keyboard_distance,
                   'log-manhattan': _log_manhattan_keyboard_distance}

    def substitution_cost(char1, char2):
        cost = sub_cost
        cost *= (metric_dict[metric](char1, char2) +
                 shift_cost * (_kb_array_for_char(char1) !=
                               _kb_array_for_char(char2)))
        return cost

    d_mat = np_zeros((len(src) + 1, len(tar) + 1), dtype=np_float32)
    for i in range(len(src) + 1):
        d_mat[i, 0] = i * del_cost
    for j in range(len(tar) + 1):
        d_mat[0, j] = j * ins_cost

    for i in range(len(src)):
        for j in range(len(tar)):
            d_mat[i + 1, j + 1] = min(
                d_mat[i + 1, j] + ins_cost,  # ins
                d_mat[i, j + 1] + del_cost,  # del
                d_mat[i, j] + (substitution_cost(src[i], tar[j])
                               if src[i] != tar[j] else 0)  # sub/==
            )

    return d_mat[len(src), len(tar)]
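

# A worked sketch of the keyboard metrics above (hand-derived from the
# QWERTY arrays): 'c' sits at row 3, column 3 and 'h' at row 2, column 6,
# so the euclidean key distance is sqrt(1 + 9) ~= 3.16 and the manhattan
# key distance is 4.  With the default costs, substituting 'h' for 'c'
# therefore costs about 0.5 * 3.16 ~= 1.58, cheaper than the 2.0 of a
# delete plus an insert, so typo('cat', 'hat') resolves it as a
# (fat-fingered) substitution.

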
def dist_typo(src, tar, metric='euclidean', cost=(1, 1, 0.5, 0.5)):
    """Return the normalized typo distance between two strings.

    This is typo distance, normalized to [0, 1].

    :param str src, tar: two strings to be compared
    :param str metric: supported values include: 'euclidean', 'manhattan',
          'log-euclidean', and 'log-manhattan'
    :param tuple cost: a 4-tuple representing the cost of the four possible
        edits: inserts, deletes, substitutions, and shift, respectively (by
        default: (1, 1, 0.5, 0.5)) The substitution & shift costs should be
        significantly less than the cost of an insertion & deletion unless
        a log metric is used.
    :return: normalized typo distance
    :rtype: float
    """
    if src == tar:
        return 0
    ins_cost, del_cost = cost[:2]
    return (typo(src, tar, metric, cost) /
            (max(len(src)*del_cost, len(tar)*ins_cost)))


def sim_typo(src, tar, metric='euclidean', cost=(1, 1, 0.5, 0.5)):
    """Return the normalized typo similarity between two strings.

    Normalized typo similarity is the complement of normalized typo distance:
    :math:`sim_{typo} = 1 - dist_{typo}`.

    :param str src, tar: two strings to be compared
    :param str metric: supported values include: 'euclidean', 'manhattan',
          'log-euclidean', and 'log-manhattan'
    :param tuple cost: a 4-tuple representing the cost of the four possible
        edits: inserts, deletes, substitutions, and shift, respectively (by
        default: (1, 1, 0.5, 0.5)) The substitution & shift costs should be
        significantly less than the cost of an insertion & deletion unless
        a log metric is used.
    :return: normalized typo similarity
    :rtype: float
    """
    return 1 - dist_typo(src, tar, metric, cost)


def dist_indel(src, tar):
    """Return the indel distance between two strings.

    This is equivalent to normalized Levenshtein distance, when only inserts
    and deletes are possible.

    :param str src, tar: two strings to be compared
    :return: normalized indel distance
    :rtype: float
    """
    return dist_levenshtein(src, tar, mode='lev', cost=(1, 1, 9999, 9999))


def sim_indel(src, tar):
    """Return the indel similarity of two strings.

    Normalized indel similarity is the complement of normalized indel
    distance: :math:`sim_{indel} = 1 - dist_{indel}`.

    :param str src, tar: two strings to be compared
    :return: normalized indel similarity
    :rtype: float
    """
    return sim_levenshtein(src, tar, mode='lev', cost=(1, 1, 9999, 9999))
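

# A brief sketch of the cost trick above: substitutions and transpositions
# are priced at a prohibitive 9999, so the underlying Levenshtein alignment
# realizes every change as a delete plus an insert.  E.g., turning 'cat'
# into 'hat' is counted as two operations (drop 'c', add 'h') rather than
# one substitution, before the usual normalization to [0, 1] is applied.

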
def _synoname_strip_punct(word):
    """Return a word with punctuation stripped out.

    :param str word: a string to remove punctuation from
    :return: the word stripped of punctuation characters
    """
    stripped = ''
    for char in word:
        if char not in set(',-./:;"&\'()!{|}?$%*+<=>[\\]^_`~'):
            stripped += char
    return stripped.strip()
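

# An illustrative call (hand-derived from the character set above; note that
# the comma and apostrophes are among the stripped characters, and
# surrounding whitespace is trimmed):
#
#     _synoname_strip_punct("dosso, 'dossi'")  # -> 'dosso dossi'

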
def synoname_word_approximation(src_ln, tar_ln, src_fn='', tar_fn='',
                                features=None):
    """Return the Synoname word approximation score for two names.

    :param str src_ln, tar_ln: last names of the source and target
    :param str src_fn, tar_fn: first names of the source and target (optional)
    :param features: a dict containing special features calculated via
        fingerprint.synoname_toolcode() (optional)
    :returns: The word approximation score
    :rtype: float
    """
    if features is None:
        features = {}
    if 'src_specials' not in features:
        features['src_specials'] = []
    if 'tar_specials' not in features:
        features['tar_specials'] = []

    src_len_specials = len(features['src_specials'])
    tar_len_specials = len(features['tar_specials'])

    # 1
    if ('gen_conflict' not in features or features['gen_conflict'] or
            'roman_conflict' not in features or features['roman_conflict']):
        return 0

    # 3 & 7
    full_tar1 = ' '.join((tar_ln, tar_fn)).replace('-', ' ')
    for s_type, s_pos in features['tar_specials']:
        if s_pos == 'a':
            full_tar1 = full_tar1[:1+len(_synoname_special_table[s_type][1])]
        elif s_pos == 'b':
            loc = full_tar1.find(' '+_synoname_special_table[s_type][1]+' ')+1
            full_tar1 = (full_tar1[:loc] +
                         full_tar1[loc +
                                   len(_synoname_special_table[s_type][1]):])
        elif s_pos == 'c':
            full_tar1 = full_tar1[1+len(_synoname_special_table[s_type][1]):]

    full_src1 = ' '.join((src_ln, src_fn)).replace('-', ' ')
    for s_type, s_pos in features['src_specials']:
        if s_pos == 'a':
            full_src1 = full_src1[:1+len(_synoname_special_table[s_type][1])]
        elif s_pos == 'b':
            loc = full_src1.find(' '+_synoname_special_table[s_type][1]+' ')+1
            full_src1 = (full_src1[:loc] +
                         full_src1[loc +
                                   len(_synoname_special_table[s_type][1]):])
        elif s_pos == 'c':
            full_src1 = full_src1[1+len(_synoname_special_table[s_type][1]):]

    full_tar2 = full_tar1
    for s_type, s_pos in features['tar_specials']:
        if s_pos == 'd':
            full_tar2 = full_tar2[len(_synoname_special_table[s_type][1]):]
        elif s_pos == 'X' and _synoname_special_table[s_type][1] in full_tar2:
            loc = full_tar2.find(' '+_synoname_special_table[s_type][1]+' ')+1
            full_tar2 = (full_tar2[:loc] +
                         full_tar2[loc +
                                   len(_synoname_special_table[s_type][1]):])

    full_src2 = full_src1
    for s_type, s_pos in features['src_specials']:
        if s_pos == 'd':
            full_src2 = full_src2[len(_synoname_special_table[s_type][1]):]
        elif s_pos == 'X' and _synoname_special_table[s_type][1] in full_src2:
            loc = full_src2.find(' '+_synoname_special_table[s_type][1]+' ')+1
            full_src2 = (full_src2[:loc] +
                         full_src2[loc +
                                   len(_synoname_special_table[s_type][1]):])

    full_tar1 = _synoname_strip_punct(full_tar1)
    tar1_words = full_tar1.split()
    tar1_num_words = len(tar1_words)

    full_src1 = _synoname_strip_punct(full_src1)
    src1_words = full_src1.split()
    src1_num_words = len(src1_words)

    full_tar2 = _synoname_strip_punct(full_tar2)
    tar2_words = full_tar2.split()
    tar2_num_words = len(tar2_words)

    full_src2 = _synoname_strip_punct(full_src2)
    src2_words = full_src2.split()
    src2_num_words = len(src2_words)

    # 2
    if (src1_num_words < 2 and src_len_specials == 0 and
            src2_num_words < 2 and tar_len_specials == 0):
        return 0

    # 4
    if (tar1_num_words == 1 and src1_num_words == 1 and
            tar1_words[0] == src1_words[0]):
        return 1
    if tar1_num_words < 2 and tar_len_specials == 0:
        return 0

    # 5
    last_found = False
    for word in tar1_words:
        if src_ln.endswith(word) or word+' ' in src_ln:
            last_found = True

    if not last_found:
        for word in src1_words:
            if tar_ln.endswith(word) or word+' ' in tar_ln:
                last_found = True

    # 6
    matches = 0
    if last_found:
        for i, s_word in enumerate(src1_words):
            for j, t_word in enumerate(tar1_words):
                if s_word == t_word:
                    src1_words[i] = '@'
                    tar1_words[j] = '@'
                    matches += 1
    w_ratio = matches/max(tar1_num_words, src1_num_words)
    if matches > 1 or (matches == 1 and
                       src1_num_words == 1 and tar1_num_words == 1 and
                       (tar_len_specials > 0 or src_len_specials > 0)):
        return w_ratio

    # 8
    if (tar2_num_words == 1 and src2_num_words == 1 and
            tar2_words[0] == src2_words[0]):
        return 1
    if tar2_num_words < 2 and tar_len_specials == 0:
        return 0

    # 9
    last_found = False
    for word in tar2_words:
        if src_ln.endswith(word) or word+' ' in src_ln:
            last_found = True

    if not last_found:
        for word in src2_words:
            if tar_ln.endswith(word) or word+' ' in tar_ln:
                last_found = True

    if not last_found:
        return 0

    # 10
    matches = 0
    if last_found:
        for i, s_word in enumerate(src2_words):
            for j, t_word in enumerate(tar2_words):
                if s_word == t_word:
                    src2_words[i] = '@'
                    tar2_words[j] = '@'
                    matches += 1
    w_ratio = matches/max(tar2_num_words, src2_num_words)
    if matches > 1 or (matches == 1 and
                       src2_num_words == 1 and tar2_num_words == 1 and
                       (tar_len_specials > 0 or src_len_specials > 0)):
        return w_ratio

    return 0
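

# A reading aid for the numbered comments above (inferred from the control
# flow, not from the cited specification): each `# n` marks a step of the
# Synoname word-approximation procedure -- early exits for generation and
# Roman-numeral conflicts and for single-word names, two rounds of special-
# term stripping (the full_*1 vs. full_*2 forms), a check that some last-
# name word is shared, and finally the ratio of matched words to the larger
# word count, which is the returned score.

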
3640
def synoname(src, tar, word_approx_min=0.3, char_approx_min=0.73,
             tests=2**12-1):
    """Return the Synoname similarity type of two words.

    Cf. :cite:`Getty:1991,Gross:1991`

    :param str or tuple src, tar: two names to be compared, each either a
        (last name, first name, qualifier) tuple or a '#'-delimited string
        whose second through fourth fields are last name, first name, and
        qualifier
    :param float word_approx_min: the minimum word approximation value to
        signal a 'word_approx' match
    :param float char_approx_min: the minimum character approximation value
        to signal a 'char_approx' match
    :param int or iterable tests: the set of tests to apply, as an integer
        bit vector or an iterable of test names drawn from test_dict below
    :return: Synoname value (a match type from match_type_dict below)
    :rtype: int
    """
    test_dict = {val: 2**n for n, val in enumerate([
        'exact', 'omission', 'substitution', 'transposition', 'punctuation',
        'initials', 'extended', 'inclusion', 'no_first', 'word_approx',
        'confusions', 'char_approx'])}
    match_type_dict = {val: n for n, val in enumerate([
        'exact', 'omission', 'substitution', 'transposition', 'punctuation',
        'initials', 'extended', 'inclusion', 'no_first', 'word_approx',
        'confusions', 'char_approx', 'no_match'], 1)}
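
    # A sketch of the bitmask convention (hand-derived from test_dict): each
    # test owns one bit, in the order listed, so tests=['exact', 'omission']
    # is converted below to 2**0 + 2**1 == 3, and the default 2**12 - 1
    # enables all twelve tests.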

    if isinstance(tests, Iterable):
        new_tests = 0
        for term in tests:
            if term in test_dict:
                new_tests += test_dict[term]
        tests = new_tests

    if isinstance(src, tuple):
        src_ln, src_fn, src_qual = src
    elif '#' in src:
        src_ln, src_fn, src_qual = src.split('#')[1:4]
    else:
        src_ln, src_fn, src_qual = src, '', ''

    if isinstance(tar, tuple):
        tar_ln, tar_fn, tar_qual = tar
    elif '#' in tar:
        tar_ln, tar_fn, tar_qual = tar.split('#')[1:4]
    else:
        tar_ln, tar_fn, tar_qual = tar, '', ''

    def split_special(spec):
        """Split a Synoname special-code string into (code, position) tuples."""
        spec_list = []
        while spec:
            spec_list.append((int(spec[:3]), spec[3:4]))
            spec = spec[4:]
        return spec_list

    # 1. Preprocessing

    # Lowercasing
    src_fn = src_fn.strip().lower()
    src_ln = src_ln.strip().lower()
    src_qual = src_qual.strip().lower()

    tar_fn = tar_fn.strip().lower()
    tar_ln = tar_ln.strip().lower()
    tar_qual = tar_qual.strip().lower()

    # Create toolcodes
    src_fn, src_ln, src_tc = synoname_toolcode(src_fn, src_ln, src_qual)
    tar_fn, tar_ln, tar_tc = synoname_toolcode(tar_fn, tar_ln, tar_qual)

    src_generation = int(src_tc[2])
    src_romancode = int(src_tc[3:6])
    src_len_fn = int(src_tc[6:8])
    src_tc = src_tc.split('$')
    src_specials = split_special(src_tc[1])

    tar_generation = int(tar_tc[2])
    tar_romancode = int(tar_tc[3:6])
    tar_len_fn = int(tar_tc[6:8])
    tar_tc = tar_tc.split('$')
    tar_specials = split_special(tar_tc[1])

    gen_conflict = (src_generation != tar_generation and
                    (src_generation or tar_generation))
    roman_conflict = (src_romancode != tar_romancode and
                      (src_romancode or tar_romancode))

    ln_equal = src_ln == tar_ln
    fn_equal = src_fn == tar_fn

    # approx_c
    def approx_c():
        """Return whether a char approximation match holds, and its ratio."""
        if gen_conflict or roman_conflict:
            return False, 0

        full_src = ' '.join((src_ln, src_fn))
        if full_src.startswith('master '):
            full_src = full_src[len('master '):]
            for intro in ['of the ', 'of ', 'known as the ', 'with the ',
                          'with ']:
                if full_src.startswith(intro):
                    full_src = full_src[len(intro):]

        full_tar = ' '.join((tar_ln, tar_fn))
        if full_tar.startswith('master '):
            full_tar = full_tar[len('master '):]
            for intro in ['of the ', 'of ', 'known as the ', 'with the ',
                          'with ']:
                if full_tar.startswith(intro):
                    full_tar = full_tar[len(intro):]

        ca_ratio = sim_ratcliff_obershelp(full_src, full_tar)
        return ca_ratio >= char_approx_min, ca_ratio

    _, ca_ratio = approx_c()

    if tests & test_dict['exact'] and fn_equal and ln_equal:
        return match_type_dict['exact']
    if tests & test_dict['omission']:
        if (fn_equal and
                levenshtein(src_ln, tar_ln, cost=(1, 1, 99, 99)) == 1):
            if not roman_conflict:
                return match_type_dict['omission']
        elif (ln_equal and
              levenshtein(src_fn, tar_fn, cost=(1, 1, 99, 99)) == 1):
            return match_type_dict['omission']
    if tests & test_dict['substitution']:
        if (fn_equal and
                levenshtein(src_ln, tar_ln, cost=(99, 99, 1, 99)) == 1):
            return match_type_dict['substitution']
        if (ln_equal and
                levenshtein(src_fn, tar_fn, cost=(99, 99, 1, 99)) == 1):
            return match_type_dict['substitution']
    if tests & test_dict['transposition']:
        if (fn_equal and
                levenshtein(src_ln, tar_ln, cost=(99, 99, 99, 1)) == 1):
            return match_type_dict['transposition']
        if (ln_equal and
                levenshtein(src_fn, tar_fn, cost=(99, 99, 99, 1)) == 1):
            return match_type_dict['transposition']
    if tests & test_dict['punctuation']:
        np_src_fn = _synoname_strip_punct(src_fn)
        np_tar_fn = _synoname_strip_punct(tar_fn)
        np_src_ln = _synoname_strip_punct(src_ln)
        np_tar_ln = _synoname_strip_punct(tar_ln)

        if np_src_fn == np_tar_fn and np_src_ln == np_tar_ln:
            return match_type_dict['punctuation']
    if tests & test_dict['initials'] and ln_equal:
        if src_fn or tar_fn:
            src_initials = ''.join(_[0] for _ in src_fn.split())
            tar_initials = ''.join(_[0] for _ in tar_fn.split())
            if src_initials == tar_initials:
                return match_type_dict['initials']
            initial_diff = abs(len(src_initials)-len(tar_initials))
            if (initial_diff and
                    ((initial_diff == levenshtein(src_initials, tar_initials,
                                                  cost=(1, 99, 99, 99))) or
                     (initial_diff == levenshtein(tar_initials, src_initials,
                                                  cost=(1, 99, 99, 99))))):
                return match_type_dict['initials']
    if tests & test_dict['extended']:
        if src_ln[0] == tar_ln[0] and (src_ln.startswith(tar_ln) or
                                       tar_ln.startswith(src_ln)):
            if ((not src_len_fn and not tar_len_fn) or
                    src_ln.startswith(tar_ln) or
                    tar_ln.startswith(src_ln)) and not roman_conflict:
                return match_type_dict['extended']
    if tests & test_dict['inclusion'] and ln_equal:
        if src_fn in tar_fn or tar_fn in src_ln:
            return match_type_dict['inclusion']
    if tests & test_dict['no_first'] and ln_equal:
        if src_fn == '' or tar_fn == '':
            return match_type_dict['no_first']
    if tests & test_dict['word_approx']:
        ratio = synoname_word_approximation(src_ln, tar_ln, src_fn, tar_fn,
                                            {'gen_conflict': gen_conflict,
                                             'roman_conflict': roman_conflict,
                                             'src_specials': src_specials,
                                             'tar_specials': tar_specials})
        if ratio == 1 and tests & test_dict['confusions']:
            if ' '.join((src_fn, src_ln)) == ' '.join((tar_fn, tar_ln)):
                return match_type_dict['confusions']
        if ratio >= word_approx_min:
            return match_type_dict['word_approx']
    if tests & test_dict['char_approx']:
        if ca_ratio >= char_approx_min:
            return match_type_dict['char_approx']
    return match_type_dict['no_match']


###############################################################################


def sim(src, tar, method=sim_levenshtein):
    """Return a similarity of two strings.

    This is a generalized function for calling other similarity functions.

    :param str src, tar: two strings to be compared
    :param function method: specifies the similarity metric (Levenshtein by
        default)
    :returns: similarity according to the specified function
    :rtype: float

    >>> round(sim('cat', 'hat'), 12)
    0.666666666667
    >>> round(sim('Niall', 'Neil'), 12)
    0.4
    >>> sim('aluminum', 'Catalan')
    0.125
    >>> sim('ATCG', 'TAGC')
    0.25
    """
    if callable(method):
        return method(src, tar)
    raise AttributeError('Unknown similarity function: ' + str(method))


def dist(src, tar, method=sim_levenshtein):
    """Return a distance between two strings.

    This is a generalized function for calling other distance functions.

    :param str src, tar: two strings to be compared
    :param function method: specifies the similarity metric (Levenshtein by
        default) -- Note that this takes a similarity metric function, not
        a distance metric function.
    :returns: distance according to the specified function
    :rtype: float

    >>> round(dist('cat', 'hat'), 12)
    0.333333333333
    >>> round(dist('Niall', 'Neil'), 12)
    0.6
    >>> dist('aluminum', 'Catalan')
    0.875
    >>> dist('ATCG', 'TAGC')
    0.75
    """
    if callable(method):
        return 1 - method(src, tar)
    raise AttributeError('Unknown distance function: ' + str(method))
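

# A usage sketch of the method parameter (assuming sim_jaccard, defined
# earlier in this module, as the alternative metric): any sim_X function
# can be passed, and dist() will return its complement:
#
#     sim('Niall', 'Neil', method=sim_jaccard)
#     dist('Niall', 'Neil', method=sim_jaccard)  # == 1 - the value above

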
if __name__ == '__main__':
    import doctest
    doctest.testmod()