# -*- coding: utf-8 -*-

# Copyright 2014-2018 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.

"""abydos.distance.

The distance module implements string edit distance functions including:

    - Levenshtein distance
    - Optimal String Alignment distance
    - Damerau-Levenshtein distance
    - Hamming distance
    - Tversky index
    - Sørensen–Dice coefficient & distance
    - Jaccard similarity coefficient & distance
    - overlap similarity & distance
    - Tanimoto coefficient & distance
    - Minkowski distance & similarity
    - Manhattan distance & similarity
    - Euclidean distance & similarity
    - Chebyshev distance & similarity
    - cosine similarity & distance
    - Jaro distance
    - Jaro-Winkler distance (incl. the strcmp95 algorithm variant)
    - Longest common substring
    - Ratcliff-Obershelp similarity & distance
    - Match Rating Algorithm similarity
    - Normalized Compression Distance (NCD) & similarity
    - Monge-Elkan similarity & distance
    - Matrix similarity
    - Needleman-Wunsch score
    - Smith-Waterman score
    - Gotoh score
    - Length similarity
    - Prefix, Suffix, and Identity similarity & distance
    - Modified Language-Independent Product Name Search (MLIPNS) similarity &
      distance
    - Bag distance
    - Editex distance
    - Eudex distances
    - Sift4 distance
    - Baystat distance & similarity
    - Typo distance
    - Indel distance
    - Synoname

Functions beginning with the prefixes 'sim' and 'dist' are guaranteed to be
in the range [0, 1], and sim_X = 1 - dist_X, since the two are complements.
If a sim_X function is supplied identical src & tar arguments, it is
guaranteed to return 1; the corresponding dist_X function is guaranteed to
return 0.
"""

from __future__ import division, unicode_literals

from codecs import encode
from collections import Counter, Iterable, defaultdict
from math import log, sqrt
from sys import maxsize, modules
from types import GeneratorType
from unicodedata import normalize

from numpy import float32 as np_float32
from numpy import int as np_int
from numpy import zeros as np_zeros

from six import text_type
from six.moves import range

from .compression import ac_encode, ac_train, rle_encode
from .fingerprint import _synoname_special_table, synoname_toolcode
from .phonetic import eudex, mra
from .qgram import QGrams

try:
    import lzma
except ImportError:  # pragma: no cover
    # If the system lacks the lzma library, that's fine, but lzma compression
    # similarity won't be supported.
    pass
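

# Illustration (added for this review, not part of the original module): the
# sim_/dist_ complement guarantee described in the module docstring, assuming
# this module is importable as abydos.distance. The pair below produces
# exactly representable floats (0.25 and 0.75), so the sum is exactly 1.0:
#
#   >>> from abydos.distance import sim_levenshtein, dist_levenshtein
#   >>> sim_levenshtein('ATCG', 'TAGC') + dist_levenshtein('ATCG', 'TAGC')
#   1.0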


def levenshtein(src, tar, mode='lev', cost=(1, 1, 1, 1)):
    """Return the Levenshtein distance between two strings.

    This is the standard edit distance measure. Cf.
    :cite:`Levenshtein:1965,Levenshtein:1966`.

    Two additional variants: optimal string alignment (aka restricted
    Damerau-Levenshtein distance) :cite:`Boytsov:2011` and the
    Damerau-Levenshtein :cite:`Damerau:1964` distance are also supported.

    The ordinary Levenshtein & Optimal String Alignment distance both
    employ the Wagner-Fischer dynamic programming algorithm
    :cite:`Wagner:1974`.

    Levenshtein edit distance ordinarily has unit insertion, deletion, and
    substitution costs.

    :param str src, tar: two strings to be compared
    :param str mode: specifies a mode for computing the Levenshtein distance:

        - 'lev' (default) computes the ordinary Levenshtein distance,
          in which edits may include inserts, deletes, and substitutions
        - 'osa' computes the Optimal String Alignment distance, in which
          edits may include inserts, deletes, substitutions, and
          transpositions but substrings may only be edited once
        - 'dam' computes the Damerau-Levenshtein distance, in which
          edits may include inserts, deletes, substitutions, and
          transpositions and substrings may undergo repeated edits

    :param tuple cost: a 4-tuple representing the cost of the four possible
        edits: inserts, deletes, substitutions, and transpositions,
        respectively (by default: (1, 1, 1, 1))
    :returns: the Levenshtein distance between src & tar
    :rtype: int (may return a float if cost has float values)

    >>> levenshtein('cat', 'hat')
    1
    >>> levenshtein('Niall', 'Neil')
    3
    >>> levenshtein('aluminum', 'Catalan')
    7
    >>> levenshtein('ATCG', 'TAGC')
    3

    >>> levenshtein('ATCG', 'TAGC', mode='osa')
    2
    >>> levenshtein('ACTG', 'TAGC', mode='osa')
    4

    >>> levenshtein('ATCG', 'TAGC', mode='dam')
    2
    >>> levenshtein('ACTG', 'TAGC', mode='dam')
    3
    """
    ins_cost, del_cost, sub_cost, trans_cost = cost

    if src == tar:
        return 0
    if not src:
        return len(tar) * ins_cost
    if not tar:
        return len(src) * del_cost

    if 'dam' in mode:
        return damerau_levenshtein(src, tar, cost)

    # pylint: disable=no-member
    d_mat = np_zeros((len(src)+1, len(tar)+1), dtype=np_int)
    # pylint: enable=no-member
    for i in range(len(src)+1):
        d_mat[i, 0] = i * del_cost
    for j in range(len(tar)+1):
        d_mat[0, j] = j * ins_cost

    for i in range(len(src)):
        for j in range(len(tar)):
            d_mat[i+1, j+1] = min(
                d_mat[i+1, j] + ins_cost,  # ins
                d_mat[i, j+1] + del_cost,  # del
                d_mat[i, j] + (sub_cost if src[i] != tar[j] else 0)  # sub/==
            )

            if mode == 'osa':
                if ((i+1 > 1 and j+1 > 1 and src[i] == tar[j-1] and
                     src[i-1] == tar[j])):
                    # transposition
                    d_mat[i+1, j+1] = min(d_mat[i+1, j+1],
                                          d_mat[i-1, j-1] + trans_cost)

    return d_mat[len(src), len(tar)]
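

# Worked example (added for illustration; not in the original source): with
# the default unit costs, substituting 'c' for 'h' costs 1, but raising the
# substitution cost to 2 makes a substitution exactly as expensive as a
# delete plus an insert:
#
#   >>> levenshtein('cat', 'hat', cost=(1, 1, 2, 1))
#   2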


def dist_levenshtein(src, tar, mode='lev', cost=(1, 1, 1, 1)):
    """Return the normalized Levenshtein distance between two strings.

    The Levenshtein distance is normalized by dividing the Levenshtein
    distance (calculated by any of the three supported methods) by the
    greater of the number of characters in src times the cost of a delete
    and the number of characters in tar times the cost of an insert.
    For the case in which all operations have :math:`cost = 1`, this is
    equivalent to the greater of the length of the two strings src & tar.

    :param str src, tar: two strings to be compared
    :param str mode: specifies a mode for computing the Levenshtein distance:

        - 'lev' (default) computes the ordinary Levenshtein distance,
          in which edits may include inserts, deletes, and substitutions
        - 'osa' computes the Optimal String Alignment distance, in which
          edits may include inserts, deletes, substitutions, and
          transpositions but substrings may only be edited once
        - 'dam' computes the Damerau-Levenshtein distance, in which
          edits may include inserts, deletes, substitutions, and
          transpositions and substrings may undergo repeated edits

    :param tuple cost: a 4-tuple representing the cost of the four possible
        edits: inserts, deletes, substitutions, and transpositions,
        respectively (by default: (1, 1, 1, 1))
    :returns: normalized Levenshtein distance
    :rtype: float

    >>> round(dist_levenshtein('cat', 'hat'), 12)
    0.333333333333
    >>> round(dist_levenshtein('Niall', 'Neil'), 12)
    0.6
    >>> dist_levenshtein('aluminum', 'Catalan')
    0.875
    >>> dist_levenshtein('ATCG', 'TAGC')
    0.75
    """
    if src == tar:
        return 0
    ins_cost, del_cost = cost[:2]
    return (levenshtein(src, tar, mode, cost) /
            (max(len(src)*del_cost, len(tar)*ins_cost)))
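

# Continuing the worked example above (illustrative): with cost=(1, 1, 2, 1),
# levenshtein('cat', 'hat') is 2 (one substitution at cost 2), and the
# normalizer is max(len('cat')*del_cost, len('hat')*ins_cost) = 3:
#
#   >>> dist_levenshtein('cat', 'hat', cost=(1, 1, 2, 1))
#   0.6666666666666666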


def sim_levenshtein(src, tar, mode='lev', cost=(1, 1, 1, 1)):
    """Return the Levenshtein similarity of two strings.

    Normalized Levenshtein similarity is the complement of normalized
    Levenshtein distance:
    :math:`sim_{Levenshtein} = 1 - dist_{Levenshtein}`.

    :param str src, tar: two strings to be compared
    :param str mode: specifies a mode for computing the Levenshtein distance:

        - 'lev' (default) computes the ordinary Levenshtein distance,
          in which edits may include inserts, deletes, and substitutions
        - 'osa' computes the Optimal String Alignment distance, in which
          edits may include inserts, deletes, substitutions, and
          transpositions but substrings may only be edited once
        - 'dam' computes the Damerau-Levenshtein distance, in which
          edits may include inserts, deletes, substitutions, and
          transpositions and substrings may undergo repeated edits

    :param tuple cost: a 4-tuple representing the cost of the four possible
        edits: inserts, deletes, substitutions, and transpositions,
        respectively (by default: (1, 1, 1, 1))
    :returns: normalized Levenshtein similarity
    :rtype: float

    >>> round(sim_levenshtein('cat', 'hat'), 12)
    0.666666666667
    >>> round(sim_levenshtein('Niall', 'Neil'), 12)
    0.4
    >>> sim_levenshtein('aluminum', 'Catalan')
    0.125
    >>> sim_levenshtein('ATCG', 'TAGC')
    0.25
    """
    return 1 - dist_levenshtein(src, tar, mode, cost)


def damerau_levenshtein(src, tar, cost=(1, 1, 1, 1)):
    """Return the Damerau-Levenshtein distance between two strings.

    This computes the Damerau-Levenshtein distance :cite:`Damerau:1964`.
    Damerau-Levenshtein code is based on Java code by Kevin L. Stern
    :cite:`Stern:2014`, under the MIT license:
    https://github.com/KevinStern/software-and-algorithms/blob/master/src/main/java/blogspot/software_and_algorithms/stern_library/string/DamerauLevenshteinAlgorithm.java

    :param str src, tar: two strings to be compared
    :param tuple cost: a 4-tuple representing the cost of the four possible
        edits: inserts, deletes, substitutions, and transpositions,
        respectively (by default: (1, 1, 1, 1))
    :returns: the Damerau-Levenshtein distance between src & tar
    :rtype: int (may return a float if cost has float values)

    >>> damerau_levenshtein('cat', 'hat')
    1
    >>> damerau_levenshtein('Niall', 'Neil')
    3
    >>> damerau_levenshtein('aluminum', 'Catalan')
    7
    >>> damerau_levenshtein('ATCG', 'TAGC')
    2
    """
    ins_cost, del_cost, sub_cost, trans_cost = cost

    if src == tar:
        return 0
    if not src:
        return len(tar) * ins_cost
    if not tar:
        return len(src) * del_cost

    if 2*trans_cost < ins_cost + del_cost:
        raise ValueError('Unsupported cost assignment; the cost of two ' +
                         'transpositions must not be less than the cost of ' +
                         'an insert plus a delete.')

    # pylint: disable=no-member
    d_mat = (np_zeros((len(src))*(len(tar)), dtype=np_int).
             reshape((len(src), len(tar))))
    # pylint: enable=no-member

    if src[0] != tar[0]:
        d_mat[0, 0] = min(sub_cost, ins_cost + del_cost)

    src_index_by_character = {src[0]: 0}

    for i in range(1, len(src)):
        del_distance = d_mat[i-1, 0] + del_cost
        ins_distance = (i+1) * del_cost + ins_cost
        match_distance = (i * del_cost +
                          (0 if src[i] == tar[0] else sub_cost))
        d_mat[i, 0] = min(del_distance, ins_distance, match_distance)

    for j in range(1, len(tar)):
        del_distance = (j+1) * ins_cost + del_cost
        ins_distance = d_mat[0, j-1] + ins_cost
        match_distance = (j * ins_cost +
                          (0 if src[0] == tar[j] else sub_cost))
        d_mat[0, j] = min(del_distance, ins_distance, match_distance)

    for i in range(1, len(src)):
        max_src_letter_match_index = (0 if src[i] == tar[0] else -1)
        for j in range(1, len(tar)):
            candidate_swap_index = (-1 if tar[j] not in
                                    src_index_by_character else
                                    src_index_by_character[tar[j]])
            j_swap = max_src_letter_match_index
            del_distance = d_mat[i-1, j] + del_cost
            ins_distance = d_mat[i, j-1] + ins_cost
            match_distance = d_mat[i-1, j-1]
            if src[i] != tar[j]:
                match_distance += sub_cost
            else:
                max_src_letter_match_index = j

            if candidate_swap_index != -1 and j_swap != -1:
                i_swap = candidate_swap_index

                if i_swap == 0 and j_swap == 0:
                    pre_swap_cost = 0
                else:
                    pre_swap_cost = d_mat[max(0, i_swap-1), max(0, j_swap-1)]
                swap_distance = (pre_swap_cost + (i - i_swap - 1) *
                                 del_cost + (j - j_swap - 1) * ins_cost +
                                 trans_cost)
            else:
                swap_distance = maxsize

            d_mat[i, j] = min(del_distance, ins_distance,
                              match_distance, swap_distance)
        src_index_by_character[src[i]] = i

    return d_mat[len(src)-1, len(tar)-1]
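

# Note on the cost constraint enforced above (illustrative, not in the
# original source): a transposition cost of 0 would make two transpositions
# cheaper than an insert plus a delete, which the algorithm does not support:
#
#   >>> damerau_levenshtein('ab', 'ba', cost=(1, 1, 1, 0))
#   Traceback (most recent call last):
#   ...
#   ValueError: Unsupported cost assignment; ...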


def dist_damerau(src, tar, cost=(1, 1, 1, 1)):
    """Return the normalized Damerau-Levenshtein distance between two strings.

    Damerau-Levenshtein distance normalized to the interval [0, 1].

    The Damerau-Levenshtein distance is normalized by dividing the
    Damerau-Levenshtein distance by the greater of
    the number of characters in src times the cost of a delete and
    the number of characters in tar times the cost of an insert.
    For the case in which all operations have :math:`cost = 1`, this is
    equivalent to the greater of the length of the two strings src & tar.

    The arguments are identical to those of the damerau_levenshtein()
    function.

    :param str src, tar: two strings to be compared
    :param tuple cost: a 4-tuple representing the cost of the four possible
        edits: inserts, deletes, substitutions, and transpositions,
        respectively (by default: (1, 1, 1, 1))
    :returns: normalized Damerau-Levenshtein distance
    :rtype: float

    >>> round(dist_damerau('cat', 'hat'), 12)
    0.333333333333
    >>> round(dist_damerau('Niall', 'Neil'), 12)
    0.6
    >>> dist_damerau('aluminum', 'Catalan')
    0.875
    >>> dist_damerau('ATCG', 'TAGC')
    0.5
    """
    if src == tar:
        return 0
    ins_cost, del_cost = cost[:2]
    return (damerau_levenshtein(src, tar, cost) /
            (max(len(src)*del_cost, len(tar)*ins_cost)))


def sim_damerau(src, tar, cost=(1, 1, 1, 1)):
    """Return the Damerau-Levenshtein similarity of two strings.

    Normalized Damerau-Levenshtein similarity is the complement of normalized
    Damerau-Levenshtein distance:
    :math:`sim_{Damerau} = 1 - dist_{Damerau}`.

    The arguments are identical to those of the damerau_levenshtein()
    function.

    :param str src, tar: two strings to be compared
    :param tuple cost: a 4-tuple representing the cost of the four possible
        edits: inserts, deletes, substitutions, and transpositions,
        respectively (by default: (1, 1, 1, 1))
    :returns: normalized Damerau-Levenshtein similarity
    :rtype: float

    >>> round(sim_damerau('cat', 'hat'), 12)
    0.666666666667
    >>> round(sim_damerau('Niall', 'Neil'), 12)
    0.4
    >>> sim_damerau('aluminum', 'Catalan')
    0.125
    >>> sim_damerau('ATCG', 'TAGC')
    0.5
    """
    return 1 - dist_damerau(src, tar, cost)


def hamming(src, tar, difflens=True):
    """Return the Hamming distance between two strings.

    Hamming distance :cite:`Hamming:1950` equals the number of character
    positions at which two strings differ. For strings of unequal lengths,
    it is not normally defined. By default, this implementation calculates
    the Hamming distance of the first n characters, where n is the lesser of
    the two strings' lengths, and adds to this the difference in string
    lengths.

    :param str src, tar: two strings to be compared
    :param bool difflens:
        If True (default), this returns the Hamming distance for those
        characters that have a matching character in both strings plus the
        difference in the strings' lengths. This is equivalent to extending
        the shorter string with obligatorily non-matching characters.
        If False, an exception is raised in the case of strings of unequal
        lengths.
    :returns: the Hamming distance between src & tar
    :rtype: int

    >>> hamming('cat', 'hat')
    1
    >>> hamming('Niall', 'Neil')
    3
    >>> hamming('aluminum', 'Catalan')
    8
    >>> hamming('ATCG', 'TAGC')
    4
    """
    if not difflens and len(src) != len(tar):
        raise ValueError('Undefined for sequences of unequal length; set ' +
                         'difflens to True for Hamming distance between ' +
                         'strings of unequal lengths.')

    hdist = 0
    if difflens:
        hdist += abs(len(src)-len(tar))
    hdist += sum(c1 != c2 for c1, c2 in zip(src, tar))

    return hdist
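

# Illustration of the difflens behavior (added for this review): equal
# prefixes contribute no mismatches, and the length difference is added on;
# with difflens=False, unequal lengths raise instead:
#
#   >>> hamming('cat', 'cats')
#   1
#   >>> hamming('cat', 'cats', difflens=False)
#   Traceback (most recent call last):
#   ...
#   ValueError: Undefined for sequences of unequal length; ...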


def dist_hamming(src, tar, difflens=True):
    """Return the normalized Hamming distance between two strings.

    Hamming distance normalized to the interval [0, 1].

    The Hamming distance is normalized by dividing it
    by the greater of the number of characters in src & tar (unless difflens
    is set to False, in which case an exception is raised).

    The arguments are identical to those of the hamming() function.

    :param str src, tar: two strings to be compared
    :param bool difflens:
        If True (default), this returns the Hamming distance for those
        characters that have a matching character in both strings plus the
        difference in the strings' lengths. This is equivalent to extending
        the shorter string with obligatorily non-matching characters.
        If False, an exception is raised in the case of strings of unequal
        lengths.
    :returns: normalized Hamming distance
    :rtype: float

    >>> round(dist_hamming('cat', 'hat'), 12)
    0.333333333333
    >>> dist_hamming('Niall', 'Neil')
    0.6
    >>> dist_hamming('aluminum', 'Catalan')
    1.0
    >>> dist_hamming('ATCG', 'TAGC')
    1.0
    """
    if src == tar:
        return 0
    return hamming(src, tar, difflens) / max(len(src), len(tar))


def sim_hamming(src, tar, difflens=True):
    """Return the normalized Hamming similarity of two strings.

    Hamming similarity normalized to the interval [0, 1].

    Hamming similarity is the complement of normalized Hamming distance:
    :math:`sim_{Hamming} = 1 - dist_{Hamming}`.

    Provided that difflens is True, the Hamming similarity is identical to
    the Language-Independent Product Name Search (LIPNS) similarity score.
    For further information, see the sim_mlipns documentation.

    The arguments are identical to those of the hamming() function.

    :param str src, tar: two strings to be compared
    :param bool difflens:
        If True (default), this returns the Hamming distance for those
        characters that have a matching character in both strings plus the
        difference in the strings' lengths. This is equivalent to extending
        the shorter string with obligatorily non-matching characters.
        If False, an exception is raised in the case of strings of unequal
        lengths.
    :returns: normalized Hamming similarity
    :rtype: float

    >>> round(sim_hamming('cat', 'hat'), 12)
    0.666666666667
    >>> sim_hamming('Niall', 'Neil')
    0.4
    >>> sim_hamming('aluminum', 'Catalan')
    0.0
    >>> sim_hamming('ATCG', 'TAGC')
    0.0
    """
    return 1 - dist_hamming(src, tar, difflens)


def _get_qgrams(src, tar, qval=None, skip=0):
    """Return the Q-Grams in src & tar.

    :param str src, tar: two strings to be compared
        (or QGrams/Counter objects)
    :param int qval: the length of each q-gram; 0 or None for non-q-gram
        version
    :param int skip: the number of characters to skip (only works when
        src and tar are strings)
    :return: Q-Grams
    """
    if isinstance(src, Counter) and isinstance(tar, Counter):
        return src, tar
    if qval and qval > 0:
        return (QGrams(src, qval, '$#', skip),
                QGrams(tar, qval, '$#', skip))
    return Counter(src.strip().split()), Counter(tar.strip().split())
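

# Illustration of the q-gram decomposition used throughout this module (added
# for this review; assumes QGrams behaves as a Counter-like multiset of
# q-grams over the input padded with the start/stop symbols '$' and '#'):
#
#   >>> sorted(QGrams('cat', 2, '$#', 0))
#   ['$c', 'at', 'ca', 't#']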


def sim_tversky(src, tar, qval=2, alpha=1, beta=1, bias=None):
    r"""Return the Tversky index of two strings.

    The Tversky index :cite:`Tversky:1977` is defined as:
    For two sets X and Y:
    :math:`sim_{Tversky}(X, Y) = \frac{|X \cap Y|}
    {|X \cap Y| + \alpha|X - Y| + \beta|Y - X|}`.

    :math:`\alpha = \beta = 1` is equivalent to the Jaccard & Tanimoto
    similarity coefficients.

    :math:`\alpha = \beta = 0.5` is equivalent to the Sørensen-Dice
    similarity coefficient :cite:`Dice:1945,Sorensen:1948`.

    Unequal α and β will tend to emphasize one or the other set's
    contributions:

        - :math:`\alpha > \beta` emphasizes the contributions of X over Y
        - :math:`\alpha < \beta` emphasizes the contributions of Y over X

    Parameter values' relation to 1 emphasizes different types of
    contributions:

        - :math:`\alpha, \beta > 1` emphasizes unique contributions over the
          intersection
        - :math:`\alpha, \beta < 1` emphasizes the intersection over unique
          contributions

    The symmetric variant is defined in :cite:`Jiminez:2013`. This is
    activated by specifying a bias parameter. (A worked example of the
    weighting follows this function.)

    :param str src, tar: two strings to be compared
        (or QGrams/Counter objects)
    :param int qval: the length of each q-gram; 0 or None for non-q-gram
        version
    :param float alpha, beta: two Tversky index parameters as described above
    :param float bias: the symmetric Tversky index bias parameter
        (default: None)
    :returns: Tversky similarity
    :rtype: float

    >>> sim_tversky('cat', 'hat')
    0.3333333333333333
    >>> sim_tversky('Niall', 'Neil')
    0.2222222222222222
    >>> sim_tversky('aluminum', 'Catalan')
    0.0625
    >>> sim_tversky('ATCG', 'TAGC')
    0.0
    """
    if alpha < 0 or beta < 0:
        raise ValueError('Unsupported weight assignment; alpha and beta ' +
                         'must be greater than or equal to 0.')

    if src == tar:
        return 1.0
    if not src or not tar:
        return 0.0

    q_src, q_tar = _get_qgrams(src, tar, qval)
    q_src_mag = sum(q_src.values())
    q_tar_mag = sum(q_tar.values())
    q_intersection_mag = sum((q_src & q_tar).values())

    if not q_src or not q_tar:
        return 0.0

    if bias is None:
        return q_intersection_mag / (q_intersection_mag + alpha *
                                     (q_src_mag - q_intersection_mag) +
                                     beta * (q_tar_mag - q_intersection_mag))

    a_val = min(q_src_mag - q_intersection_mag,
                q_tar_mag - q_intersection_mag)
    b_val = max(q_src_mag - q_intersection_mag,
                q_tar_mag - q_intersection_mag)
    c_val = q_intersection_mag + bias
    return c_val / (beta * (alpha * a_val + (1 - alpha) * b_val) + c_val)
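

# Worked example of the weighting (illustrative, not in the original source):
# for 'cat' vs. 'hat' with qval=2, the padded bigram multisets share two
# q-grams ('at', 't#') and each side has two unique ones, so alpha=2, beta=1
# gives 2 / (2 + 2*2 + 1*2):
#
#   >>> sim_tversky('cat', 'hat', alpha=2, beta=1)
#   0.25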


def dist_tversky(src, tar, qval=2, alpha=1, beta=1, bias=None):
    """Return the Tversky distance between two strings.

    Tversky distance is the complement of the Tversky index (similarity):
    :math:`dist_{Tversky} = 1-sim_{Tversky}`.

    :param str src, tar: two strings to be compared
        (or QGrams/Counter objects)
    :param int qval: the length of each q-gram; 0 or None for non-q-gram
        version
    :param float alpha, beta: two Tversky index parameters as described in
        the sim_tversky documentation
    :param float bias: the symmetric Tversky index bias parameter
        (default: None)
    :returns: Tversky distance
    :rtype: float

    >>> dist_tversky('cat', 'hat')
    0.6666666666666667
    >>> dist_tversky('Niall', 'Neil')
    0.7777777777777778
    >>> dist_tversky('aluminum', 'Catalan')
    0.9375
    >>> dist_tversky('ATCG', 'TAGC')
    1.0
    """
    return 1 - sim_tversky(src, tar, qval, alpha, beta, bias)


def sim_dice(src, tar, qval=2):
    r"""Return the Sørensen–Dice coefficient of two strings.

    For two sets X and Y, the Sørensen–Dice coefficient
    :cite:`Dice:1945,Sorensen:1948` is
    :math:`sim_{dice}(X, Y) = \frac{2 \cdot |X \cap Y|}{|X| + |Y|}`.

    This is identical to the Tversky index :cite:`Tversky:1977` for
    :math:`\alpha = \beta = 0.5`. (The Tanimoto coefficient
    :cite:`Tanimoto:1958`, by contrast, corresponds to
    :math:`\alpha = \beta = 1`.)

    :param str src, tar: two strings to be compared (or QGrams/Counter objects)
    :param int qval: the length of each q-gram; 0 or None for non-q-gram
        version
    :returns: Sørensen–Dice similarity
    :rtype: float

    >>> sim_dice('cat', 'hat')
    0.5
    >>> sim_dice('Niall', 'Neil')
    0.36363636363636365
    >>> sim_dice('aluminum', 'Catalan')
    0.11764705882352941
    >>> sim_dice('ATCG', 'TAGC')
    0.0
    """
    return sim_tversky(src, tar, qval, 0.5, 0.5)


def dist_dice(src, tar, qval=2):
    """Return the Sørensen–Dice distance between two strings.

    Sørensen–Dice distance is the complement of the Sørensen–Dice
    coefficient:
    :math:`dist_{dice} = 1 - sim_{dice}`.

    :param str src, tar: two strings to be compared (or QGrams/Counter objects)
    :param int qval: the length of each q-gram; 0 or None for non-q-gram
        version
    :returns: Sørensen–Dice distance
    :rtype: float

    >>> dist_dice('cat', 'hat')
    0.5
    >>> dist_dice('Niall', 'Neil')
    0.6363636363636364
    >>> dist_dice('aluminum', 'Catalan')
    0.8823529411764706
    >>> dist_dice('ATCG', 'TAGC')
    1.0
    """
    return 1 - sim_dice(src, tar, qval)


def sim_jaccard(src, tar, qval=2):
    r"""Return the Jaccard similarity of two strings.

    For two sets X and Y, the Jaccard similarity coefficient
    :cite:`Jaccard:1901` is :math:`sim_{jaccard}(X, Y) =
    \frac{|X \cap Y|}{|X \cup Y|}`.

    This is identical to the Tanimoto similarity coefficient
    :cite:`Tanimoto:1958` and the Tversky index :cite:`Tversky:1977` for
    :math:`\alpha = \beta = 1`.

    :param str src, tar: two strings to be compared (or QGrams/Counter objects)
    :param int qval: the length of each q-gram; 0 or None for non-q-gram
        version
    :returns: Jaccard similarity
    :rtype: float

    >>> sim_jaccard('cat', 'hat')
    0.3333333333333333
    >>> sim_jaccard('Niall', 'Neil')
    0.2222222222222222
    >>> sim_jaccard('aluminum', 'Catalan')
    0.0625
    >>> sim_jaccard('ATCG', 'TAGC')
    0.0
    """
    return sim_tversky(src, tar, qval, 1, 1)


def dist_jaccard(src, tar, qval=2):
    """Return the Jaccard distance between two strings.

    Jaccard distance is the complement of the Jaccard similarity coefficient:
    :math:`dist_{Jaccard} = 1 - sim_{Jaccard}`.

    :param str src, tar: two strings to be compared (or QGrams/Counter objects)
    :param int qval: the length of each q-gram; 0 or None for non-q-gram
        version
    :returns: Jaccard distance
    :rtype: float

    >>> dist_jaccard('cat', 'hat')
    0.6666666666666667
    >>> dist_jaccard('Niall', 'Neil')
    0.7777777777777778
    >>> dist_jaccard('aluminum', 'Catalan')
    0.9375
    >>> dist_jaccard('ATCG', 'TAGC')
    1.0
    """
    return 1 - sim_jaccard(src, tar, qval)


def sim_overlap(src, tar, qval=2):
    r"""Return the overlap coefficient of two strings.

    For two sets X and Y, the overlap coefficient
    :cite:`Szymkiewicz:1934,Simpson:1949`, also called the
    Szymkiewicz-Simpson coefficient, is
    :math:`sim_{overlap}(X, Y) = \frac{|X \cap Y|}{\min(|X|, |Y|)}`.

    :param str src, tar: two strings to be compared (or QGrams/Counter objects)
    :param int qval: the length of each q-gram; 0 or None for non-q-gram
        version
    :returns: overlap similarity
    :rtype: float

    >>> sim_overlap('cat', 'hat')
    0.5
    >>> sim_overlap('Niall', 'Neil')
    0.4
    >>> sim_overlap('aluminum', 'Catalan')
    0.125
    >>> sim_overlap('ATCG', 'TAGC')
    0.0
    """
    if src == tar:
        return 1.0
    if not src or not tar:
        return 0.0

    q_src, q_tar = _get_qgrams(src, tar, qval)
    q_src_mag = sum(q_src.values())
    q_tar_mag = sum(q_tar.values())
    q_intersection_mag = sum((q_src & q_tar).values())

    return q_intersection_mag / min(q_src_mag, q_tar_mag)


def dist_overlap(src, tar, qval=2):
    """Return the overlap distance between two strings.

    Overlap distance is the complement of the overlap coefficient:
    :math:`dist_{overlap} = 1 - sim_{overlap}`.

    :param str src, tar: two strings to be compared (or QGrams/Counter objects)
    :param int qval: the length of each q-gram; 0 or None for non-q-gram
        version
    :returns: overlap distance
    :rtype: float

    >>> dist_overlap('cat', 'hat')
    0.5
    >>> dist_overlap('Niall', 'Neil')
    0.6
    >>> dist_overlap('aluminum', 'Catalan')
    0.875
    >>> dist_overlap('ATCG', 'TAGC')
    1.0
    """
    return 1 - sim_overlap(src, tar, qval)


def sim_tanimoto(src, tar, qval=2):
    r"""Return the Tanimoto similarity of two strings.

    For two sets X and Y, the Tanimoto similarity coefficient
    :cite:`Tanimoto:1958` is
    :math:`sim_{Tanimoto}(X, Y) = \frac{|X \cap Y|}{|X \cup Y|}`.

    This is identical to the Jaccard similarity coefficient
    :cite:`Jaccard:1901` and the Tversky index :cite:`Tversky:1977` for
    :math:`\alpha = \beta = 1`.

    :param str src, tar: two strings to be compared (or QGrams/Counter objects)
    :param int qval: the length of each q-gram; 0 or None for non-q-gram
        version
    :returns: Tanimoto similarity
    :rtype: float

    >>> sim_tanimoto('cat', 'hat')
    0.3333333333333333
    >>> sim_tanimoto('Niall', 'Neil')
    0.2222222222222222
    >>> sim_tanimoto('aluminum', 'Catalan')
    0.0625
    >>> sim_tanimoto('ATCG', 'TAGC')
    0.0
    """
    return sim_jaccard(src, tar, qval)


def tanimoto(src, tar, qval=2):
    r"""Return the Tanimoto distance between two strings.

    Tanimoto distance is computed here as :math:`log_{2} sim_{Tanimoto}`,
    which is 0 for identical strings and negative otherwise (the classical
    Tanimoto distance is its negation, :math:`-log_{2} sim_{Tanimoto}`).

    :param str src, tar: two strings to be compared (or QGrams/Counter objects)
    :param int qval: the length of each q-gram; 0 or None for non-q-gram
        version
    :returns: Tanimoto distance
    :rtype: float

    >>> tanimoto('cat', 'hat')
    -1.5849625007211563
    >>> tanimoto('Niall', 'Neil')
    -2.1699250014423126
    >>> tanimoto('aluminum', 'Catalan')
    -4.0
    >>> tanimoto('ATCG', 'TAGC')
    -inf
    """
    coeff = sim_jaccard(src, tar, qval)
    if coeff != 0:
        return log(coeff, 2)

    return float('-inf')


def minkowski(src, tar, qval=2, pval=1, normalize=False):
    """Return the Minkowski distance (:math:`L^p-norm`) of two strings.

    The Minkowski distance :cite:`Minkowski:1910` is a distance metric in
    :math:`L^p-space`.

    :param str src, tar: two strings to be compared (or QGrams/Counter objects)
    :param int qval: the length of each q-gram; 0 or None for non-q-gram
        version
    :param pval: the :math:`p`-value of the :math:`L^p`-space
    :param normalize: normalizes to [0, 1] if True
    :returns: the Minkowski distance
    :rtype: float
    """
    q_src, q_tar = _get_qgrams(src, tar, qval)
    diffs = ((q_src - q_tar) + (q_tar - q_src)).values()

    normalizer = 1
    if normalize:
        totals = (q_src + q_tar).values()
        if pval == 0:
            normalizer = len(totals)
        else:
            normalizer = sum(_**pval for _ in totals)**(1 / pval)

    if pval == float('inf'):
        # Chebyshev distance
        return max(diffs)/normalizer
    if pval == 0:
        # This is the l_0 "norm" as developed by David Donoho
        return len(diffs)/normalizer
    return sum(_**pval for _ in diffs)**(1 / pval)/normalizer
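

# Worked L^1 example (added for illustration; assumes the '$'/'#'-padded
# bigram decomposition described at _get_qgrams): 'cat' and 'hat' share the
# bigrams 'at' and 't#' and differ in four ('$c', 'ca' vs. '$h', 'ha'), out
# of eight bigram occurrences in total:
#
#   >>> minkowski('cat', 'hat')
#   4.0
#   >>> minkowski('cat', 'hat', normalize=True)   # 4/8
#   0.5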


def dist_minkowski(src, tar, qval=2, pval=1):
    """Return the Minkowski distance of two strings, normalized to [0, 1].

    The normalized Minkowski distance :cite:`Minkowski:1910` is a distance
    metric in :math:`L^p-space`, normalized to [0, 1].

    :param str src, tar: two strings to be compared (or QGrams/Counter objects)
    :param int qval: the length of each q-gram; 0 or None for non-q-gram
        version
    :param pval: the :math:`p`-value of the :math:`L^p`-space
    :returns: the normalized Minkowski distance
    :rtype: float
    """
    return minkowski(src, tar, qval, pval, True)


def sim_minkowski(src, tar, qval=2, pval=1):
    """Return the Minkowski similarity of two strings, normalized to [0, 1].

    Minkowski similarity is the complement of Minkowski distance:
    :math:`sim_{Minkowski} = 1 - dist_{Minkowski}`.

    :param str src, tar: two strings to be compared (or QGrams/Counter objects)
    :param int qval: the length of each q-gram; 0 or None for non-q-gram
        version
    :param pval: the :math:`p`-value of the :math:`L^p`-space
    :returns: the normalized Minkowski similarity
    :rtype: float
    """
    return 1 - minkowski(src, tar, qval, pval, True)


def manhattan(src, tar, qval=2, normalize=False):
    """Return the Manhattan distance between two strings.

    Manhattan distance is the city-block or taxi-cab distance, equivalent
    to Minkowski distance in :math:`L^1`-space.

    :param str src, tar: two strings to be compared (or QGrams/Counter objects)
    :param int qval: the length of each q-gram; 0 or None for non-q-gram
        version
    :param normalize: normalizes to [0, 1] if True
    :returns: the Manhattan distance
    :rtype: float
    """
    return minkowski(src, tar, qval, 1, normalize)


def dist_manhattan(src, tar, qval=2):
    """Return the Manhattan distance between two strings, normalized to [0, 1].

    The normalized Manhattan distance is a distance
    metric in :math:`L^1-space`, normalized to [0, 1].

    This is closely related to Canberra distance, except that the
    normalization here is applied to the sum of differences as a whole
    rather than term by term.

    :param str src, tar: two strings to be compared (or QGrams/Counter objects)
    :param int qval: the length of each q-gram; 0 or None for non-q-gram
        version
    :returns: the normalized Manhattan distance
    :rtype: float
    """
    return manhattan(src, tar, qval, True)


def sim_manhattan(src, tar, qval=2):
    """Return the Manhattan similarity of two strings, normalized to [0, 1].

    Manhattan similarity is the complement of Manhattan distance:
    :math:`sim_{Manhattan} = 1 - dist_{Manhattan}`.

    :param str src, tar: two strings to be compared (or QGrams/Counter objects)
    :param int qval: the length of each q-gram; 0 or None for non-q-gram
        version
    :returns: the normalized Manhattan similarity
    :rtype: float
    """
    return 1 - manhattan(src, tar, qval, True)


def euclidean(src, tar, qval=2, normalize=False):
    """Return the Euclidean distance between two strings.

    Euclidean distance is the straight-line or as-the-crow-flies distance,
    equivalent to Minkowski distance in :math:`L^2`-space.

    :param str src, tar: two strings to be compared (or QGrams/Counter objects)
    :param int qval: the length of each q-gram; 0 or None for non-q-gram
        version
    :param normalize: normalizes to [0, 1] if True
    :returns: the Euclidean distance
    :rtype: float
    """
    return minkowski(src, tar, qval, 2, normalize)


def dist_euclidean(src, tar, qval=2):
    """Return the Euclidean distance between two strings, normalized to [0, 1].

    The normalized Euclidean distance is a distance
    metric in :math:`L^2-space`, normalized to [0, 1].

    :param str src, tar: two strings to be compared (or QGrams/Counter objects)
    :param int qval: the length of each q-gram; 0 or None for non-q-gram
        version
    :returns: the normalized Euclidean distance
    :rtype: float
    """
    return euclidean(src, tar, qval, True)


def sim_euclidean(src, tar, qval=2):
    """Return the Euclidean similarity of two strings, normalized to [0, 1].

    Euclidean similarity is the complement of Euclidean distance:
    :math:`sim_{Euclidean} = 1 - dist_{Euclidean}`.

    :param str src, tar: two strings to be compared (or QGrams/Counter objects)
    :param int qval: the length of each q-gram; 0 or None for non-q-gram
        version
    :returns: the normalized Euclidean similarity
    :rtype: float
    """
    return 1 - euclidean(src, tar, qval, True)


def chebyshev(src, tar, qval=2, normalize=False):
    r"""Return the Chebyshev distance between two strings.

    Chebyshev distance is the chessboard distance,
    equivalent to Minkowski distance in :math:`L^\infty`-space.

    :param str src, tar: two strings to be compared (or QGrams/Counter objects)
    :param int qval: the length of each q-gram; 0 or None for non-q-gram
        version
    :param normalize: normalizes to [0, 1] if True
    :returns: the Chebyshev distance
    :rtype: float
    """
    return minkowski(src, tar, qval, float('inf'), normalize)


def dist_chebyshev(src, tar, qval=2):
    r"""Return the Chebyshev distance between two strings, normalized to [0, 1].

    The normalized Chebyshev distance :cite:`Minkowski:1910` is a distance
    metric in :math:`L^\infty`-space, normalized to [0, 1].

    :param str src, tar: two strings to be compared (or QGrams/Counter objects)
    :param int qval: the length of each q-gram; 0 or None for non-q-gram
        version
    :returns: the normalized Chebyshev distance
    :rtype: float
    """
    return chebyshev(src, tar, qval, True)


def sim_chebyshev(src, tar, qval=2):
    """Return the Chebyshev similarity of two strings, normalized to [0, 1].

    Chebyshev similarity is the complement of Chebyshev distance:
    :math:`sim_{Chebyshev} = 1 - dist_{Chebyshev}`.

    :param str src, tar: two strings to be compared (or QGrams/Counter objects)
    :param int qval: the length of each q-gram; 0 or None for non-q-gram
        version
    :returns: the normalized Chebyshev similarity
    :rtype: float
    """
    return 1 - chebyshev(src, tar, qval, True)


def sim_cosine(src, tar, qval=2):
    r"""Return the cosine similarity of two strings.

    For two sets X and Y, the cosine similarity, Otsuka-Ochiai coefficient, or
    Ochiai coefficient :cite:`Otsuka:1936,Ochiai:1957` is:
    :math:`sim_{cosine}(X, Y) = \frac{|X \cap Y|}{\sqrt{|X| \cdot |Y|}}`.

    :param str src, tar: two strings to be compared (or QGrams/Counter objects)
    :param int qval: the length of each q-gram; 0 or None for non-q-gram
        version
    :returns: cosine similarity
    :rtype: float

    >>> sim_cosine('cat', 'hat')
    0.5
    >>> sim_cosine('Niall', 'Neil')
    0.3651483716701107
    >>> sim_cosine('aluminum', 'Catalan')
    0.11785113019775793
    >>> sim_cosine('ATCG', 'TAGC')
    0.0
    """
    if src == tar:
        return 1.0
    if not src or not tar:
        return 0.0

    q_src, q_tar = _get_qgrams(src, tar, qval)
    q_src_mag = sum(q_src.values())
    q_tar_mag = sum(q_tar.values())
    q_intersection_mag = sum((q_src & q_tar).values())

    return q_intersection_mag / sqrt(q_src_mag * q_tar_mag)


def dist_cosine(src, tar, qval=2):
    """Return the cosine distance between two strings.

    Cosine distance is the complement of cosine similarity:
    :math:`dist_{cosine} = 1 - sim_{cosine}`.

    :param str src, tar: two strings to be compared (or QGrams/Counter objects)
    :param int qval: the length of each q-gram; 0 or None for non-q-gram
        version
    :returns: cosine distance
    :rtype: float

    >>> dist_cosine('cat', 'hat')
    0.5
    >>> dist_cosine('Niall', 'Neil')
    0.6348516283298893
    >>> dist_cosine('aluminum', 'Catalan')
    0.882148869802242
    >>> dist_cosine('ATCG', 'TAGC')
    1.0
    """
    return 1 - sim_cosine(src, tar, qval)


def sim_strcmp95(src, tar, long_strings=False):
    """Return the strcmp95 similarity of two strings.

    This is a Python translation of the C code for strcmp95:
    http://web.archive.org/web/20110629121242/http://www.census.gov/geo/msb/stand/strcmp.c
    :cite:`Winkler:1994`.
    The above file is a US Government publication and, accordingly,
    in the public domain.

    This is based on the Jaro-Winkler distance, but also attempts to correct
    for some common typos and frequently confused characters. It is also
    limited to uppercase ASCII characters, so it is appropriate to American
    names, but not much else.

    :param str src, tar: two strings to be compared
    :param bool long_strings: set to True to "Increase the probability of a
        match when the number of matched characters is large.  This option
        allows for a little more tolerance when the strings are large. It is
        not an appropriate test when comparing fixed length fields such as
        phone and social security numbers."
    :returns: strcmp95 similarity
    :rtype: float

    >>> sim_strcmp95('cat', 'hat')
    0.7777777777777777
    >>> sim_strcmp95('Niall', 'Neil')
    0.8454999999999999
    >>> sim_strcmp95('aluminum', 'Catalan')
    0.6547619047619048
    >>> sim_strcmp95('ATCG', 'TAGC')
    0.8333333333333334
    """
    def _inrange(char):
        """Return True if char is in the range (0, 91)."""
        return 0 < ord(char) < 91

    ying = src.strip().upper()
    yang = tar.strip().upper()

    if ying == yang:
        return 1.0
    # If either string is blank - return - added in Version 2
    if not ying or not yang:
        return 0.0

    adjwt = defaultdict(int)
    sp_mx = (
        ('A', 'E'), ('A', 'I'), ('A', 'O'), ('A', 'U'), ('B', 'V'), ('E', 'I'),
        ('E', 'O'), ('E', 'U'), ('I', 'O'), ('I', 'U'), ('O', 'U'), ('I', 'Y'),
        ('E', 'Y'), ('C', 'G'), ('E', 'F'), ('W', 'U'), ('W', 'V'), ('X', 'K'),
        ('S', 'Z'), ('X', 'S'), ('Q', 'C'), ('U', 'V'), ('M', 'N'), ('L', 'I'),
        ('Q', 'O'), ('P', 'R'), ('I', 'J'), ('2', 'Z'), ('5', 'S'), ('8', 'B'),
        ('1', 'I'), ('1', 'L'), ('0', 'O'), ('0', 'Q'), ('C', 'K'), ('G', 'J')
    )

    # Initialize the adjwt array. The adjwt array is used to give partial
    # credit for characters that may be errors due to known phonetic or
    # character recognition errors. A typical example is to match the
    # letter "O" with the number "0".
    for i in sp_mx:
        adjwt[(i[0], i[1])] = 3
        adjwt[(i[1], i[0])] = 3

    if len(ying) > len(yang):
        search_range = len(ying)
        minv = len(yang)
    else:
        search_range = len(yang)
        minv = len(ying)

    # Blank out the flags
    ying_flag = [0] * search_range
    yang_flag = [0] * search_range
    search_range = max(0, search_range // 2 - 1)

    # Looking only within the search range, count and flag the matched pairs.
    num_com = 0
    yl1 = len(yang) - 1
    for i in range(len(ying)):
        lowlim = (i - search_range) if (i >= search_range) else 0
        hilim = (i + search_range) if ((i + search_range) <= yl1) else yl1
        for j in range(lowlim, hilim+1):
            if (yang_flag[j] == 0) and (yang[j] == ying[i]):
                yang_flag[j] = 1
                ying_flag[i] = 1
                num_com += 1
                break

    # If no characters in common - return
    if num_com == 0:
        return 0.0

    # Count the number of transpositions
    k = n_trans = 0
    for i in range(len(ying)):
        if ying_flag[i] != 0:
            for j in range(k, len(yang)):
                if yang_flag[j] != 0:
                    k = j + 1
                    break
            if ying[i] != yang[j]:
                n_trans += 1
    n_trans = n_trans // 2

    # Adjust for similarities in unmatched characters
    n_simi = 0
    if minv > num_com:
        for i in range(len(ying)):
            if ying_flag[i] == 0 and _inrange(ying[i]):
                for j in range(len(yang)):
                    if yang_flag[j] == 0 and _inrange(yang[j]):
                        if (ying[i], yang[j]) in adjwt:
                            n_simi += adjwt[(ying[i], yang[j])]
                            yang_flag[j] = 2
                            break
    num_sim = n_simi/10.0 + num_com

    # Main weight computation
    weight = num_sim / len(ying) + num_sim / len(yang) + \
        (num_com - n_trans) / num_com
    weight = weight / 3.0

    # Continue to boost the weight if the strings are similar
    if weight > 0.7:

        # Adjust for having up to the first 4 characters in common
        j = 4 if (minv >= 4) else minv
        i = 0
        while (i < j) and (ying[i] == yang[i]) and (not ying[i].isdigit()):
            i += 1
        if i:
            weight += i * 0.1 * (1.0 - weight)

        # Optionally adjust for long strings.

        # After agreeing beginning chars, at least two more must agree and
        # the agreeing characters must be > .5 of remaining characters.
        if (((long_strings) and (minv > 4) and (num_com > i+1) and
             (2*num_com >= minv+i))):
            if not ying[0].isdigit():
                weight += (1.0-weight) * ((num_com-i-1) /
                                          (len(ying)+len(yang)-i*2+2))

    return weight


def dist_strcmp95(src, tar, long_strings=False):
    """Return the strcmp95 distance between two strings.

    strcmp95 distance is the complement of strcmp95 similarity:
    :math:`dist_{strcmp95} = 1 - sim_{strcmp95}`.

    :param str src, tar: two strings to be compared
    :param bool long_strings: set to True to "Increase the probability of a
        match when the number of matched characters is large.  This option
        allows for a little more tolerance when the strings are large. It is
        not an appropriate test when comparing fixed length fields such as
        phone and social security numbers."
    :returns: strcmp95 distance
    :rtype: float

    >>> round(dist_strcmp95('cat', 'hat'), 12)
    0.222222222222
    >>> round(dist_strcmp95('Niall', 'Neil'), 12)
    0.1545
    >>> round(dist_strcmp95('aluminum', 'Catalan'), 12)
    0.345238095238
    >>> round(dist_strcmp95('ATCG', 'TAGC'), 12)
    0.166666666667
    """
    return 1 - sim_strcmp95(src, tar, long_strings)
def sim_jaro_winkler(src, tar, qval=1, mode='winkler', long_strings=False,
                     boost_threshold=0.7, scaling_factor=0.1):
    """Return the Jaro or Jaro-Winkler similarity of two strings.

    Jaro(-Winkler) distance is a string edit distance initially proposed by
    Jaro and extended by Winkler :cite:`Jaro:1989,Winkler:1990`.

    This is a Python translation of the C code for strcmp95:
    http://web.archive.org/web/20110629121242/http://www.census.gov/geo/msb/stand/strcmp.c
    :cite:`Winkler:1994`. The above file is a US Government publication and,
    accordingly, in the public domain.

    :param str src, tar: two strings to be compared
    :param int qval: the length of each q-gram (defaults to 1: character-wise
        matching)
    :param str mode: indicates which variant of this distance metric to
        compute:

            - 'winkler' -- computes the Jaro-Winkler distance (default) which
              increases the score for matches near the start of the word
            - 'jaro' -- computes the Jaro distance

    The following arguments apply only when mode is 'winkler':

    :param bool long_strings: set to True to "Increase the probability of a
        match when the number of matched characters is large.  This option
        allows for a little more tolerance when the strings are large.  It is
        not an appropriate test when comparing fixed length fields such as
        phone and social security numbers."
    :param float boost_threshold: a value between 0 and 1, below which the
        Winkler boost is not applied (defaults to 0.7)
    :param float scaling_factor: a value between 0 and 0.25, indicating by how
        much to boost scores for matching prefixes (defaults to 0.1)

    :returns: Jaro or Jaro-Winkler similarity
    :rtype: float

    >>> round(sim_jaro_winkler('cat', 'hat'), 12)
    0.777777777778
    >>> round(sim_jaro_winkler('Niall', 'Neil'), 12)
    0.805
    >>> round(sim_jaro_winkler('aluminum', 'Catalan'), 12)
    0.60119047619
    >>> round(sim_jaro_winkler('ATCG', 'TAGC'), 12)
    0.833333333333

    >>> round(sim_jaro_winkler('cat', 'hat', mode='jaro'), 12)
    0.777777777778
    >>> round(sim_jaro_winkler('Niall', 'Neil', mode='jaro'), 12)
    0.783333333333
    >>> round(sim_jaro_winkler('aluminum', 'Catalan', mode='jaro'), 12)
    0.60119047619
    >>> round(sim_jaro_winkler('ATCG', 'TAGC', mode='jaro'), 12)
    0.833333333333
    """
    if mode == 'winkler':
        if boost_threshold > 1 or boost_threshold < 0:
            raise ValueError('Unsupported boost_threshold assignment; ' +
                             'boost_threshold must be between 0 and 1.')
        if scaling_factor > 0.25 or scaling_factor < 0:
            raise ValueError('Unsupported scaling_factor assignment; ' +
                             'scaling_factor must be between 0 and 0.25.')

    if src == tar:
        return 1.0

    src = QGrams(src.strip(), qval).ordered_list
    tar = QGrams(tar.strip(), qval).ordered_list

    lens = len(src)
    lent = len(tar)

    # If either string is blank - return - added in Version 2
    if lens == 0 or lent == 0:
        return 0.0

    if lens > lent:
        search_range = lens
        minv = lent
    else:
        search_range = lent
        minv = lens

    # Zero out the flags
    src_flag = [0] * search_range
    tar_flag = [0] * search_range
    search_range = max(0, search_range//2 - 1)

    # Looking only within the search range, count and flag the matched pairs.
    num_com = 0
    yl1 = lent - 1
    for i in range(lens):
        lowlim = (i - search_range) if (i >= search_range) else 0
        hilim = (i + search_range) if ((i + search_range) <= yl1) else yl1
        for j in range(lowlim, hilim+1):
            if (tar_flag[j] == 0) and (tar[j] == src[i]):
                tar_flag[j] = 1
                src_flag[i] = 1
                num_com += 1
                break

    # If no characters in common - return
    if num_com == 0:
        return 0.0

    # Count the number of transpositions
    k = n_trans = 0
    for i in range(lens):
        if src_flag[i] != 0:
            for j in range(k, lent):
                if tar_flag[j] != 0:
                    k = j + 1
                    break
            if src[i] != tar[j]:
                n_trans += 1
    n_trans = n_trans // 2

    # Main weight computation for Jaro distance
    weight = num_com / lens + num_com / lent + (num_com - n_trans) / num_com
    weight = weight / 3.0

    # Continue to boost the weight if the strings are similar
    # This is the Winkler portion of Jaro-Winkler distance
    if mode == 'winkler' and weight > boost_threshold:

        # Adjust for having up to the first 4 characters in common
        j = 4 if (minv >= 4) else minv
        i = 0
        while (i < j) and (src[i] == tar[i]):
            i += 1
        if i:
            weight += i * scaling_factor * (1.0 - weight)

        # Optionally adjust for long strings.

        # After agreeing beginning chars, at least two more must agree and
        # the agreeing characters must be > .5 of remaining characters.
        if (((long_strings) and (minv > 4) and (num_com > i+1) and
             (2*num_com >= minv+i))):
            weight += (1.0-weight) * ((num_com-i-1) / (lens+lent-i*2+2))

    return weight


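# An illustrative sketch (not part of the original source): the Winkler
# boost above moves the Jaro weight toward 1.0 in proportion to the length
# of the common prefix (capped at 4 characters). For a hypothetical Jaro
# weight of 0.78 with a 3-character common prefix and the default
# scaling_factor of 0.1:
#
#     weight = 0.78 + 3 * 0.1 * (1.0 - 0.78)  # ~0.846
#
# Raising scaling_factor toward its 0.25 maximum amplifies this prefix
# reward; setting boost_threshold to 1.0 disables the boost entirely.

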
def dist_jaro_winkler(src, tar, qval=1, mode='winkler', long_strings=False,
                      boost_threshold=0.7, scaling_factor=0.1):
    """Return the Jaro or Jaro-Winkler distance between two strings.

    Jaro(-Winkler) distance is the complement of Jaro(-Winkler) similarity:
    :math:`dist_{Jaro(-Winkler)} = 1 - sim_{Jaro(-Winkler)}`.

    :param str src, tar: two strings to be compared
    :param int qval: the length of each q-gram (defaults to 1: character-wise
        matching)
    :param str mode: indicates which variant of this distance metric to
        compute:

            - 'winkler' -- computes the Jaro-Winkler distance (default) which
              increases the score for matches near the start of the word
            - 'jaro' -- computes the Jaro distance

    The following arguments apply only when mode is 'winkler':

    :param bool long_strings: set to True to "Increase the probability of a
        match when the number of matched characters is large.  This option
        allows for a little more tolerance when the strings are large.  It is
        not an appropriate test when comparing fixed length fields such as
        phone and social security numbers."
    :param float boost_threshold: a value between 0 and 1, below which the
        Winkler boost is not applied (defaults to 0.7)
    :param float scaling_factor: a value between 0 and 0.25, indicating by how
        much to boost scores for matching prefixes (defaults to 0.1)

    :returns: Jaro or Jaro-Winkler distance
    :rtype: float

    >>> round(dist_jaro_winkler('cat', 'hat'), 12)
    0.222222222222
    >>> round(dist_jaro_winkler('Niall', 'Neil'), 12)
    0.195
    >>> round(dist_jaro_winkler('aluminum', 'Catalan'), 12)
    0.39880952381
    >>> round(dist_jaro_winkler('ATCG', 'TAGC'), 12)
    0.166666666667

    >>> round(dist_jaro_winkler('cat', 'hat', mode='jaro'), 12)
    0.222222222222
    >>> round(dist_jaro_winkler('Niall', 'Neil', mode='jaro'), 12)
    0.216666666667
    >>> round(dist_jaro_winkler('aluminum', 'Catalan', mode='jaro'), 12)
    0.39880952381
    >>> round(dist_jaro_winkler('ATCG', 'TAGC', mode='jaro'), 12)
    0.166666666667
    """
    return 1 - sim_jaro_winkler(src, tar, qval, mode, long_strings,
                                boost_threshold, scaling_factor)


def lcsseq(src, tar):
    """Return the longest common subsequence of two strings.

    Longest common subsequence (LCSseq) is the longest subsequence of
    characters that two strings have in common.

    Based on the dynamic programming algorithm from
    http://rosettacode.org/wiki/Longest_common_subsequence#Dynamic_Programming_6
    :cite:`rosettacode:2018b`. This is licensed GFDL 1.2.

    Modifications include:
        conversion to a numpy array in place of a list of lists

    :param str src, tar: two strings to be compared
    :returns: the longest common subsequence
    :rtype: str

    >>> lcsseq('cat', 'hat')
    'at'
    >>> lcsseq('Niall', 'Neil')
    'Nil'
    >>> lcsseq('aluminum', 'Catalan')
    'aln'
    >>> lcsseq('ATCG', 'TAGC')
    'AC'
    """
    # pylint: disable=no-member
    lengths = np_zeros((len(src)+1, len(tar)+1), dtype=np_int)
    # pylint: enable=no-member

    # row 0 and column 0 are initialized to 0 already
    for i, src_char in enumerate(src):
        for j, tar_char in enumerate(tar):
            if src_char == tar_char:
                lengths[i+1, j+1] = lengths[i, j] + 1
            else:
                lengths[i+1, j+1] = max(lengths[i+1, j], lengths[i, j+1])

    # read the substring out from the matrix
    result = ''
    i, j = len(src), len(tar)
    while i != 0 and j != 0:
        if lengths[i, j] == lengths[i-1, j]:
            i -= 1
        elif lengths[i, j] == lengths[i, j-1]:
            j -= 1
        else:
            result = src[i-1] + result
            i -= 1
            j -= 1
    return result


def sim_lcsseq(src, tar):
    r"""Return the longest common subsequence similarity of two strings.

    Longest common subsequence similarity (:math:`sim_{LCSseq}`).

    This employs the LCSseq function to derive a similarity metric:
    :math:`sim_{LCSseq}(s,t) = \\frac{|LCSseq(s,t)|}{max(|s|, |t|)}`

    :param str src, tar: two strings to be compared
    :returns: LCSseq similarity
    :rtype: float

    >>> sim_lcsseq('cat', 'hat')
    0.6666666666666666
    >>> sim_lcsseq('Niall', 'Neil')
    0.6
    >>> sim_lcsseq('aluminum', 'Catalan')
    0.375
    >>> sim_lcsseq('ATCG', 'TAGC')
    0.5
    """
    if src == tar:
        return 1.0
    elif not src or not tar:
        return 0.0
    return len(lcsseq(src, tar)) / max(len(src), len(tar))


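# A worked illustration (not from the original source): for 'Niall' and
# 'Neil' the longest common subsequence is 'Nil', so the similarity is
# len('Nil') / max(len('Niall'), len('Neil')) = 3 / 5 = 0.6, matching the
# doctest above.

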
def dist_lcsseq(src, tar):
    """Return the longest common subsequence distance between two strings.

    Longest common subsequence distance (:math:`dist_{LCSseq}`).

    This employs the LCSseq function to derive a distance metric:
    :math:`dist_{LCSseq}(s,t) = 1 - sim_{LCSseq}(s,t)`

    :param str src, tar: two strings to be compared
    :returns: LCSseq distance
    :rtype: float

    >>> dist_lcsseq('cat', 'hat')
    0.33333333333333337
    >>> dist_lcsseq('Niall', 'Neil')
    0.4
    >>> dist_lcsseq('aluminum', 'Catalan')
    0.625
    >>> dist_lcsseq('ATCG', 'TAGC')
    0.5
    """
    return 1 - sim_lcsseq(src, tar)


def lcsstr(src, tar):
    """Return the longest common substring of two strings.

    Longest common substring (LCSstr).

    Based on the code from
    https://en.wikibooks.org/wiki/Algorithm_Implementation/Strings/Longest_common_substring#Python
    :cite:`Wikibooks:2018`.
    This is licensed Creative Commons: Attribution-ShareAlike 3.0.

    Modifications include:

        - conversion to a numpy array in place of a list of lists
        - conversion to Python 2/3-safe range from xrange via six

    :param str src, tar: two strings to be compared
    :returns: the longest common substring
    :rtype: str

    >>> lcsstr('cat', 'hat')
    'at'
    >>> lcsstr('Niall', 'Neil')
    'N'
    >>> lcsstr('aluminum', 'Catalan')
    'al'
    >>> lcsstr('ATCG', 'TAGC')
    'A'
    """
    # pylint: disable=no-member
    lengths = np_zeros((len(src)+1, len(tar)+1), dtype=np_int)
    # pylint: enable=no-member
    longest, i_longest = 0, 0
    for i in range(1, len(src)+1):
        for j in range(1, len(tar)+1):
            if src[i-1] == tar[j-1]:
                lengths[i, j] = lengths[i-1, j-1] + 1
                if lengths[i, j] > longest:
                    longest = lengths[i, j]
                    i_longest = i
            else:
                lengths[i, j] = 0
    return src[i_longest - longest:i_longest]


def sim_lcsstr(src, tar):
    r"""Return the longest common substring similarity of two strings.

    Longest common substring similarity (:math:`sim_{LCSstr}`).

    This employs the LCSstr function to derive a similarity metric:
    :math:`sim_{LCSstr}(s,t) = \\frac{|LCSstr(s,t)|}{max(|s|, |t|)}`

    :param str src, tar: two strings to be compared
    :returns: LCSstr similarity
    :rtype: float

    >>> sim_lcsstr('cat', 'hat')
    0.6666666666666666
    >>> sim_lcsstr('Niall', 'Neil')
    0.2
    >>> sim_lcsstr('aluminum', 'Catalan')
    0.25
    >>> sim_lcsstr('ATCG', 'TAGC')
    0.25
    """
    if src == tar:
        return 1.0
    elif not src or not tar:
        return 0.0
    return len(lcsstr(src, tar)) / max(len(src), len(tar))


def dist_lcsstr(src, tar):
    """Return the longest common substring distance between two strings.

    Longest common substring distance (:math:`dist_{LCSstr}`).

    This employs the LCSstr function to derive a distance metric:
    :math:`dist_{LCSstr}(s,t) = 1 - sim_{LCSstr}(s,t)`

    :param str src, tar: two strings to be compared
    :returns: LCSstr distance
    :rtype: float

    >>> dist_lcsstr('cat', 'hat')
    0.33333333333333337
    >>> dist_lcsstr('Niall', 'Neil')
    0.8
    >>> dist_lcsstr('aluminum', 'Catalan')
    0.75
    >>> dist_lcsstr('ATCG', 'TAGC')
    0.75
    """
    return 1 - sim_lcsstr(src, tar)


def sim_ratcliff_obershelp(src, tar):
    """Return the Ratcliff-Obershelp similarity of two strings.

    This follows the Ratcliff-Obershelp algorithm :cite:`Ratcliff:1988` to
    derive a similarity measure:

        1. Find the length of the longest common substring in src & tar.
        2. Recurse on the strings to the left & right of this substring
           in src & tar. The base case is a 0 length common substring, in which
           case, return 0. Otherwise, return the sum of the current longest
           common substring and the left & right recursed sums.
        3. Multiply this length by 2 and divide by the sum of the lengths of
           src & tar.

    Cf.
    http://www.drdobbs.com/database/pattern-matching-the-gestalt-approach/184407970

    :param str src, tar: two strings to be compared
    :returns: Ratcliff-Obershelp similarity
    :rtype: float

    >>> round(sim_ratcliff_obershelp('cat', 'hat'), 12)
    0.666666666667
    >>> round(sim_ratcliff_obershelp('Niall', 'Neil'), 12)
    0.666666666667
    >>> round(sim_ratcliff_obershelp('aluminum', 'Catalan'), 12)
    0.4
    >>> sim_ratcliff_obershelp('ATCG', 'TAGC')
    0.5
    """
    def _lcsstr_stl(src, tar):
        """Return start positions & length for Ratcliff-Obershelp.

        Return the start position in the source string, start position in
        the target string, and length of the longest common substring of
        strings src and tar.
        """
        # pylint: disable=no-member
        lengths = np_zeros((len(src)+1, len(tar)+1), dtype=np_int)
        # pylint: enable=no-member
        longest, src_longest, tar_longest = 0, 0, 0
        for i in range(1, len(src)+1):
            for j in range(1, len(tar)+1):
                if src[i-1] == tar[j-1]:
                    lengths[i, j] = lengths[i-1, j-1] + 1
                    if lengths[i, j] > longest:
                        longest = lengths[i, j]
                        src_longest = i
                        tar_longest = j
                else:
                    lengths[i, j] = 0
        return (src_longest-longest, tar_longest-longest, longest)

    def _sstr_matches(src, tar):
        """Return the sum of substring match lengths.

        This follows the Ratcliff-Obershelp algorithm :cite:`Ratcliff:1988`:
             1. Find the length of the longest common substring in src & tar.
             2. Recurse on the strings to the left & right of each
                 substring in src & tar.
             3. Base case is a 0 length common substring, in which case,
                 return 0.
             4. Return the sum.
        """
        src_start, tar_start, length = _lcsstr_stl(src, tar)
        if length == 0:
            return 0
        return (_sstr_matches(src[:src_start], tar[:tar_start]) +
                length +
                _sstr_matches(src[src_start+length:], tar[tar_start+length:]))

    if src == tar:
        return 1.0
    elif not src or not tar:
        return 0.0
    return 2*_sstr_matches(src, tar)/(len(src)+len(tar))


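# A worked illustration (not from the original source): for 'Niall' vs.
# 'Neil', the first longest common substring found is 'N'. Recursing to its
# right, 'iall' vs. 'eil' yields 'i', and recursing right again, 'all' vs.
# 'l' yields 'l'. The matches thus total 1 + 1 + 1 = 3 characters, giving
# 2 * 3 / (5 + 4) = 6/9 ~ 0.667, matching the doctest above.

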
def dist_ratcliff_obershelp(src, tar):
    """Return the Ratcliff-Obershelp distance between two strings.

    Ratcliff-Obershelp distance is the complement of Ratcliff-Obershelp
    similarity:
    :math:`dist_{Ratcliff-Obershelp} = 1 - sim_{Ratcliff-Obershelp}`.

    :param str src, tar: two strings to be compared
    :returns: Ratcliff-Obershelp distance
    :rtype: float

    >>> round(dist_ratcliff_obershelp('cat', 'hat'), 12)
    0.333333333333
    >>> round(dist_ratcliff_obershelp('Niall', 'Neil'), 12)
    0.333333333333
    >>> round(dist_ratcliff_obershelp('aluminum', 'Catalan'), 12)
    0.6
    >>> dist_ratcliff_obershelp('ATCG', 'TAGC')
    0.5
    """
    return 1 - sim_ratcliff_obershelp(src, tar)


def mra_compare(src, tar):
    """Return the MRA comparison rating of two strings.

    The Western Airlines Surname Match Rating Algorithm comparison rating, as
    presented on page 18 of :cite:`Moore:1977`.

    :param str src, tar: two strings to be compared
    :returns: MRA comparison rating
    :rtype: int

    >>> mra_compare('cat', 'hat')
    5
    >>> mra_compare('Niall', 'Neil')
    6
    >>> mra_compare('aluminum', 'Catalan')
    0
    >>> mra_compare('ATCG', 'TAGC')
    5
    """
    if src == tar:
        return 6
    if src == '' or tar == '':
        return 0
    src = list(mra(src))
    tar = list(mra(tar))

    if abs(len(src)-len(tar)) > 2:
        return 0

    length_sum = len(src) + len(tar)
    if length_sum < 5:
        min_rating = 5
    elif length_sum < 8:
        min_rating = 4
    elif length_sum < 12:
        min_rating = 3
    else:
        min_rating = 2

    for _ in range(2):
        new_src = []
        new_tar = []
        minlen = min(len(src), len(tar))
        for i in range(minlen):
            if src[i] != tar[i]:
                new_src.append(src[i])
                new_tar.append(tar[i])
        src = new_src+src[minlen:]
        tar = new_tar+tar[minlen:]
        src.reverse()
        tar.reverse()

    similarity = 6 - max(len(src), len(tar))

    if similarity >= min_rating:
        return similarity
    return 0


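# A worked illustration (not from the original source, and assuming the MRA
# codices mra('cat') == 'CT' and mra('hat') == 'HT'): the combined codex
# length of 4 requires a minimum rating of 5. Striking the positionally
# matching 'T's leaves one residual character on each side ('C' vs. 'H'),
# so the rating is 6 - 1 = 5, which meets the minimum and is returned, as
# in the doctest above.

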
def sim_mra(src, tar):
    """Return the normalized MRA similarity of two strings.

    This is the MRA normalized to :math:`[0, 1]`, given that MRA itself is
    constrained to the range :math:`[0, 6]`.

    :param str src, tar: two strings to be compared
    :returns: normalized MRA similarity
    :rtype: float

    >>> sim_mra('cat', 'hat')
    0.8333333333333334
    >>> sim_mra('Niall', 'Neil')
    1.0
    >>> sim_mra('aluminum', 'Catalan')
    0.0
    >>> sim_mra('ATCG', 'TAGC')
    0.8333333333333334
    """
    return mra_compare(src, tar)/6


def dist_mra(src, tar):
    """Return the normalized MRA distance between two strings.

    MRA distance is the complement of MRA similarity:
    :math:`dist_{MRA} = 1 - sim_{MRA}`.

    :param str src, tar: two strings to be compared
    :returns: normalized MRA distance
    :rtype: float

    >>> dist_mra('cat', 'hat')
    0.16666666666666663
    >>> dist_mra('Niall', 'Neil')
    0.0
    >>> dist_mra('aluminum', 'Catalan')
    1.0
    >>> dist_mra('ATCG', 'TAGC')
    0.16666666666666663
    """
    return 1 - sim_mra(src, tar)


def dist_compression(src, tar, compressor='bz2', probs=None):
    """Return the normalized compression distance between two strings.

    Normalized compression distance (NCD) :cite:`Cilibrasi:2005`.

    :param str src, tar: two strings to be compared
    :param str compressor: a compression scheme to use for the similarity
        calculation, from the following:

            - `zlib` -- standard zlib/gzip
            - `bz2` -- bzip2 (default)
            - `lzma` -- Lempel–Ziv–Markov chain algorithm
            - `arith` -- arithmetic coding
            - `rle` -- run-length encoding
            - `bwtrle` -- Burrows-Wheeler transform followed by run-length
              encoding

    :param dict probs: a dictionary trained with ac_train (for the arith
        compressor only)
    :returns: compression distance
    :rtype: float

    >>> dist_compression('cat', 'hat')
    0.08
    >>> dist_compression('Niall', 'Neil')
    0.037037037037037035
    >>> dist_compression('aluminum', 'Catalan')
    0.20689655172413793
    >>> dist_compression('ATCG', 'TAGC')
    0.037037037037037035

    >>> dist_compression('Niall', 'Neil', compressor='zlib')
    0.45454545454545453
    >>> dist_compression('Niall', 'Neil', compressor='bz2')
    0.037037037037037035
    >>> dist_compression('Niall', 'Neil', compressor='lzma')
    0.16
    >>> dist_compression('Niall', 'Neil', compressor='arith')
    0.6875
    >>> dist_compression('Niall', 'Neil', compressor='rle')
    1.0
    >>> dist_compression('Niall', 'Neil', compressor='bwtrle')
    0.8333333333333334
    """
    if src == tar:
        return 0.0

    if compressor not in {'arith', 'rle', 'bwtrle'}:
        src = src.encode('utf-8')
        tar = tar.encode('utf-8')

    if compressor == 'bz2':
        src_comp = encode(src, 'bz2_codec')[15:]
        tar_comp = encode(tar, 'bz2_codec')[15:]
        concat_comp = encode(src+tar, 'bz2_codec')[15:]
        concat_comp2 = encode(tar+src, 'bz2_codec')[15:]
    elif compressor == 'lzma':
        if 'lzma' in modules:
            src_comp = lzma.compress(src)[14:]
            tar_comp = lzma.compress(tar)[14:]
            concat_comp = lzma.compress(src+tar)[14:]
            concat_comp2 = lzma.compress(tar+src)[14:]
        else:  # pragma: no cover
            raise ValueError('Install the PylibLZMA module in order to use ' +
                             'lzma compression similarity')
    elif compressor == 'arith':
        if probs is None:
            # lacking a reasonable dictionary, train on the strings themselves
            probs = ac_train(src+tar)
        src_comp = ac_encode(src, probs)[1]
        tar_comp = ac_encode(tar, probs)[1]
        concat_comp = ac_encode(src+tar, probs)[1]
        concat_comp2 = ac_encode(tar+src, probs)[1]
        return ((min(concat_comp, concat_comp2) - min(src_comp, tar_comp)) /
                max(src_comp, tar_comp))
    elif compressor in {'rle', 'bwtrle'}:
        src_comp = rle_encode(src, (compressor == 'bwtrle'))
        tar_comp = rle_encode(tar, (compressor == 'bwtrle'))
        concat_comp = rle_encode(src+tar, (compressor == 'bwtrle'))
        concat_comp2 = rle_encode(tar+src, (compressor == 'bwtrle'))
    else:  # zlib
        src_comp = encode(src, 'zlib_codec')[2:]
        tar_comp = encode(tar, 'zlib_codec')[2:]
        concat_comp = encode(src+tar, 'zlib_codec')[2:]
        concat_comp2 = encode(tar+src, 'zlib_codec')[2:]
    return ((min(len(concat_comp), len(concat_comp2)) -
             min(len(src_comp), len(tar_comp))) /
            max(len(src_comp), len(tar_comp)))


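# For reference (an explanatory note, not from the original source), the
# computation above follows the normalized compression distance of
# :cite:`Cilibrasi:2005`:
#
#     NCD(x, y) = (C(xy) - min(C(x), C(y))) / max(C(x), C(y))
#
# where C(s) is the compressed length of s. Taking the smaller of C(src+tar)
# and C(tar+src) reduces sensitivity to concatenation order.

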
def sim_compression(src, tar, compressor='bz2', probs=None):
    """Return the normalized compression similarity of two strings.

    Normalized compression similarity is the complement of normalized
    compression distance:
    :math:`sim_{NCS} = 1 - dist_{NCD}`.

    :param str src, tar: two strings to be compared
    :param str compressor: a compression scheme to use for the similarity
        calculation:

            - `zlib` -- standard zlib/gzip
            - `bz2` -- bzip2 (default)
            - `lzma` -- Lempel–Ziv–Markov chain algorithm
            - `arith` -- arithmetic coding
            - `rle` -- run-length encoding
            - `bwtrle` -- Burrows-Wheeler transform followed by run-length
              encoding

    :param dict probs: a dictionary trained with ac_train (for the arith
        compressor only)
    :returns: compression similarity
    :rtype: float

    >>> sim_compression('cat', 'hat')
    0.92
    >>> sim_compression('Niall', 'Neil')
    0.962962962962963
    >>> sim_compression('aluminum', 'Catalan')
    0.7931034482758621
    >>> sim_compression('ATCG', 'TAGC')
    0.962962962962963

    >>> sim_compression('Niall', 'Neil', compressor='zlib')
    0.5454545454545454
    >>> sim_compression('Niall', 'Neil', compressor='bz2')
    0.962962962962963
    >>> sim_compression('Niall', 'Neil', compressor='lzma')
    0.84
    >>> sim_compression('Niall', 'Neil', compressor='arith')
    0.3125
    >>> sim_compression('Niall', 'Neil', compressor='rle')
    0.0
    >>> sim_compression('Niall', 'Neil', compressor='bwtrle')
    0.16666666666666663
    """
    return 1 - dist_compression(src, tar, compressor, probs)


def sim_monge_elkan(src, tar, sim_func=sim_levenshtein, symmetric=False):
    """Return the Monge-Elkan similarity of two strings.

    Monge-Elkan is defined in :cite:`Monge:1996`.

    Note: Monge-Elkan is NOT a symmetric similarity algorithm. Thus, the
    similarity of src to tar is not necessarily equal to the similarity of
    tar to src. If the symmetric argument is True, a symmetric value is
    calculated, at the cost of doubling the computation time (since
    :math:`sim_{Monge-Elkan}(src, tar)` and
    :math:`sim_{Monge-Elkan}(tar, src)` are both calculated and then averaged).

    :param str src, tar: two strings to be compared
    :param function sim_func: the internal similarity metric to employ
    :param bool symmetric: return a symmetric similarity measure
    :returns: Monge-Elkan similarity
    :rtype: float

    >>> sim_monge_elkan('cat', 'hat')
    0.75
    >>> round(sim_monge_elkan('Niall', 'Neil'), 12)
    0.666666666667
    >>> round(sim_monge_elkan('aluminum', 'Catalan'), 12)
    0.388888888889
    >>> sim_monge_elkan('ATCG', 'TAGC')
    0.5
    """
    if src == tar:
        return 1.0

    q_src = sorted(QGrams(src).elements())
    q_tar = sorted(QGrams(tar).elements())

    if not q_src or not q_tar:
        return 0.0

    sum_of_maxes = 0
    for q_s in q_src:
        max_sim = float('-inf')
        for q_t in q_tar:
            max_sim = max(max_sim, sim_func(q_s, q_t))
        sum_of_maxes += max_sim
    sim_em = sum_of_maxes / len(q_src)

    if symmetric:
        sim_em = (sim_em + sim_monge_elkan(tar, src, sim_func, False))/2

    return sim_em


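# An illustrative sketch (not from the original source): because the outer
# loop runs over the q-grams of src only, the two directions can differ,
# e.g. sim_monge_elkan('aluminum', 'Catalan') need not equal
# sim_monge_elkan('Catalan', 'aluminum'). Passing symmetric=True averages
# the two directions, and any [0, 1] similarity can be swapped in:
#
#     sim_monge_elkan('Niall', 'Neil', sim_func=sim_jaro_winkler,
#                     symmetric=True)

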
def dist_monge_elkan(src, tar, sim_func=sim_levenshtein, symmetric=False):
    """Return the Monge-Elkan distance between two strings.

    Monge-Elkan distance is the complement of Monge-Elkan similarity:
    :math:`dist_{Monge-Elkan} = 1 - sim_{Monge-Elkan}`.

    :param str src, tar: two strings to be compared
    :param function sim_func: the internal similarity metric to employ
    :param bool symmetric: return a symmetric similarity measure
    :returns: Monge-Elkan distance
    :rtype: float

    >>> dist_monge_elkan('cat', 'hat')
    0.25
    >>> round(dist_monge_elkan('Niall', 'Neil'), 12)
    0.333333333333
    >>> round(dist_monge_elkan('aluminum', 'Catalan'), 12)
    0.611111111111
    >>> dist_monge_elkan('ATCG', 'TAGC')
    0.5
    """
    return 1 - sim_monge_elkan(src, tar, sim_func, symmetric)


def sim_ident(src, tar):
    """Return the identity similarity of two strings.

    Identity similarity is 1 if the two strings are identical, otherwise 0.

    :param str src, tar: two strings to be compared
    :returns: identity similarity
    :rtype: int

    >>> sim_ident('cat', 'hat')
    0
    >>> sim_ident('cat', 'cat')
    1
    """
    return int(src == tar)


def dist_ident(src, tar):
    """Return the identity distance between two strings.

    This is 0 if the two strings are identical, otherwise 1, i.e.
    :math:`dist_{identity} = 1 - sim_{identity}`.

    :param str src, tar: two strings to be compared
    :returns: identity distance
    :rtype: int

    >>> dist_ident('cat', 'hat')
    1
    >>> dist_ident('cat', 'cat')
    0
    """
    return 1 - sim_ident(src, tar)


def sim_matrix(src, tar, mat=None, mismatch_cost=0, match_cost=1,
               symmetric=True, alphabet=None):
    """Return the matrix similarity of two strings.

    With the default parameters, this is identical to sim_ident.
    It is possible for sim_matrix to return values outside of the range
    :math:`[0, 1]`, if values outside that range are present in mat,
    mismatch_cost, or match_cost.

    :param str src, tar: two strings to be compared
    :param dict mat: a dict mapping tuples to costs; the tuples are (src, tar)
        pairs of symbols from the alphabet parameter
    :param float mismatch_cost: the value returned if (src, tar) is absent from
        mat when src does not equal tar
    :param float match_cost: the value returned if (src, tar) is absent from
        mat when src equals tar
    :param bool symmetric: True if the cost of src not matching tar is
        identical to the cost of tar not matching src; in this case, the values
        in mat need only contain (src, tar) or (tar, src), not both
    :param str alphabet: a collection of tokens from which src and tar are
        drawn; if this is defined a ValueError is raised if either tar or src
        is not found in alphabet
    :returns: matrix similarity
    :rtype: float

    >>> sim_matrix('cat', 'hat')
    0
    >>> sim_matrix('hat', 'hat')
    1
    """
    if alphabet:
        alphabet = tuple(alphabet)
        for i in src:
            if i not in alphabet:
                raise ValueError('src value not in alphabet')
        for i in tar:
            if i not in alphabet:
                raise ValueError('tar value not in alphabet')

    if src == tar:
        if mat and (src, src) in mat:
            return mat[(src, src)]
        return match_cost
    if mat and (src, tar) in mat:
        return mat[(src, tar)]
    elif symmetric and mat and (tar, src) in mat:
        return mat[(tar, src)]
    return mismatch_cost


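# An illustrative sketch (not from the original source): a hypothetical
# two-entry substitution matrix that scores purine-purine and
# pyrimidine-pyrimidine DNA mismatches as partially similar. Only one
# ordering per pair is needed while symmetric=True:
#
#     dna_mat = {('A', 'G'): 0.5, ('C', 'T'): 0.5}
#     sim_matrix('A', 'G', mat=dna_mat, mismatch_cost=0)  # 0.5
#     sim_matrix('A', 'C', mat=dna_mat, mismatch_cost=0)  # 0

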
def needleman_wunsch(src, tar, gap_cost=1, sim_func=sim_ident):
    """Return the Needleman-Wunsch score of two strings.

    The Needleman-Wunsch score :cite:`Needleman:1970` is a standard edit
    distance measure.

    :param str src, tar: two strings to be compared
    :param float gap_cost: the cost of an alignment gap (1 by default)
    :param function sim_func: a function that returns the similarity of two
        characters (identity similarity by default)
    :returns: Needleman-Wunsch score
    :rtype: float (in fact dependent on the gap_cost & return value of
        sim_func)

    >>> needleman_wunsch('cat', 'hat')
    2.0
    >>> needleman_wunsch('Niall', 'Neil')
    1.0
    >>> needleman_wunsch('aluminum', 'Catalan')
    -1.0
    >>> needleman_wunsch('ATCG', 'TAGC')
    0.0
    """
    # pylint: disable=no-member
    d_mat = np_zeros((len(src)+1, len(tar)+1), dtype=np_float32)
    # pylint: enable=no-member

    for i in range(len(src)+1):
        d_mat[i, 0] = -(i * gap_cost)
    for j in range(len(tar)+1):
        d_mat[0, j] = -(j * gap_cost)
    for i in range(1, len(src)+1):
        for j in range(1, len(tar)+1):
            match = d_mat[i-1, j-1] + sim_func(src[i-1], tar[j-1])
            delete = d_mat[i-1, j] - gap_cost
            insert = d_mat[i, j-1] - gap_cost
            d_mat[i, j] = max(match, delete, insert)
    return d_mat[d_mat.shape[0]-1, d_mat.shape[1]-1]


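# An illustrative sketch (not from the original source): sim_func may be
# any function scoring a pair of characters, so a substitution matrix
# defined for sim_matrix (above) plugs in directly, e.g. with a
# hypothetical half-credit purine pair:
#
#     def dna_sim(a, b):
#         return sim_matrix(a, b, mat={('A', 'G'): 0.5}, mismatch_cost=-1)
#
#     needleman_wunsch('GATTACA', 'GCATGCT', gap_cost=1, sim_func=dna_sim)

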
def smith_waterman(src, tar, gap_cost=1, sim_func=sim_ident):
    """Return the Smith-Waterman score of two strings.

    The Smith-Waterman score :cite:`Smith:1981` is a standard edit distance
    measure, differing from Needleman-Wunsch in that it focuses on local
    alignment and disallows negative scores.

    :param str src, tar: two strings to be compared
    :param float gap_cost: the cost of an alignment gap (1 by default)
    :param function sim_func: a function that returns the similarity of two
        characters (identity similarity by default)
    :returns: Smith-Waterman score
    :rtype: float (in fact dependent on the gap_cost & return value of
        sim_func)

    >>> smith_waterman('cat', 'hat')
    2.0
    >>> smith_waterman('Niall', 'Neil')
    1.0
    >>> smith_waterman('aluminum', 'Catalan')
    0.0
    >>> smith_waterman('ATCG', 'TAGC')
    1.0
    """
    # pylint: disable=no-member
    d_mat = np_zeros((len(src)+1, len(tar)+1), dtype=np_float32)
    # pylint: enable=no-member

    for i in range(len(src)+1):
        d_mat[i, 0] = 0
    for j in range(len(tar)+1):
        d_mat[0, j] = 0
    for i in range(1, len(src)+1):
        for j in range(1, len(tar)+1):
            match = d_mat[i-1, j-1] + sim_func(src[i-1], tar[j-1])
            delete = d_mat[i-1, j] - gap_cost
            insert = d_mat[i, j-1] - gap_cost
            d_mat[i, j] = max(0, match, delete, insert)
    return d_mat[d_mat.shape[0]-1, d_mat.shape[1]-1]


def gotoh(src, tar, gap_open=1, gap_ext=0.4, sim_func=sim_ident):
    """Return the Gotoh score of two strings.

    The Gotoh score :cite:`Gotoh:1982` is essentially Needleman-Wunsch with
    affine gap penalties.

    :param str src, tar: two strings to be compared
    :param float gap_open: the cost of an open alignment gap (1 by default)
    :param float gap_ext: the cost of an alignment gap extension (0.4 by
        default)
    :param function sim_func: a function that returns the similarity of two
        characters (identity similarity by default)
    :returns: Gotoh score
    :rtype: float (in fact dependent on the gap_cost & return value of
        sim_func)

    >>> gotoh('cat', 'hat')
    2.0
    >>> gotoh('Niall', 'Neil')
    1.0
    >>> round(gotoh('aluminum', 'Catalan'), 12)
    -0.4
    >>> gotoh('cat', 'hat')
    2.0
    """
    # pylint: disable=no-member
    d_mat = np_zeros((len(src)+1, len(tar)+1), dtype=np_float32)
    p_mat = np_zeros((len(src)+1, len(tar)+1), dtype=np_float32)
    q_mat = np_zeros((len(src)+1, len(tar)+1), dtype=np_float32)
    # pylint: enable=no-member

    d_mat[0, 0] = 0
    p_mat[0, 0] = float('-inf')
    q_mat[0, 0] = float('-inf')
    for i in range(1, len(src)+1):
        d_mat[i, 0] = float('-inf')
        p_mat[i, 0] = -gap_open - gap_ext*(i-1)
        q_mat[i, 0] = float('-inf')
        q_mat[i, 1] = -gap_open
    for j in range(1, len(tar)+1):
        d_mat[0, j] = float('-inf')
        p_mat[0, j] = float('-inf')
        p_mat[1, j] = -gap_open
        q_mat[0, j] = -gap_open - gap_ext*(j-1)

    for i in range(1, len(src)+1):
        for j in range(1, len(tar)+1):
            sim_val = sim_func(src[i-1], tar[j-1])
            d_mat[i, j] = max(d_mat[i-1, j-1] + sim_val,
                              p_mat[i-1, j-1] + sim_val,
                              q_mat[i-1, j-1] + sim_val)

            p_mat[i, j] = max(d_mat[i-1, j] - gap_open,
                              p_mat[i-1, j] - gap_ext)

            q_mat[i, j] = max(d_mat[i, j-1] - gap_open,
                              q_mat[i, j-1] - gap_ext)

    i, j = (n - 1 for n in d_mat.shape)
    return max(d_mat[i, j], p_mat[i, j], q_mat[i, j])


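# An illustrative note (not from the original source): under the default
# affine penalties, a single contiguous gap of length 3 costs
# gap_open + 2*gap_ext = 1 + 0.8 = 1.8, whereas needleman_wunsch with
# gap_cost=1 charges 3 for the same gap. Lowering gap_ext relative to
# gap_open thus favors alignments with a few long gaps over many short
# ones, e.g.:
#
#     gotoh('aluminum', 'Catalan', gap_open=1, gap_ext=0.1)

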
def sim_length(src, tar):
    """Return the length similarity of two strings.

    Length similarity is the ratio of the length of the shorter string to the
    longer.

    :param str src, tar: two strings to be compared
    :returns: length similarity
    :rtype: float

    >>> sim_length('cat', 'hat')
    1.0
    >>> sim_length('Niall', 'Neil')
    0.8
    >>> sim_length('aluminum', 'Catalan')
    0.875
    >>> sim_length('ATCG', 'TAGC')
    1.0
    """
    if src == tar:
        return 1.0
    if not src or not tar:
        return 0.0
    return len(src)/len(tar) if len(src) < len(tar) else len(tar)/len(src)


def dist_length(src, tar):
    """Return the length distance between two strings.

    Length distance is the complement of length similarity:
    :math:`dist_{length} = 1 - sim_{length}`.

    :param str src, tar: two strings to be compared
    :returns: length distance
    :rtype: float

    >>> dist_length('cat', 'hat')
    0.0
    >>> dist_length('Niall', 'Neil')
    0.19999999999999996
    >>> dist_length('aluminum', 'Catalan')
    0.125
    >>> dist_length('ATCG', 'TAGC')
    0.0
    """
    return 1 - sim_length(src, tar)


def sim_prefix(src, tar):
    """Return the prefix similarity of two strings.

    Prefix similarity is the ratio of the length of the shorter term that
    exactly matches the longer term to the length of the shorter term,
    beginning at the start of both terms.

    :param str src, tar: two strings to be compared
    :returns: prefix similarity
    :rtype: float

    >>> sim_prefix('cat', 'hat')
    0.0
    >>> sim_prefix('Niall', 'Neil')
    0.25
    >>> sim_prefix('aluminum', 'Catalan')
    0.0
    >>> sim_prefix('ATCG', 'TAGC')
    0.0
    """
    if src == tar:
        return 1.0
    if not src or not tar:
        return 0.0
    min_word, max_word = (src, tar) if len(src) < len(tar) else (tar, src)
    min_len = len(min_word)
    for i in range(min_len, 0, -1):
        if min_word[:i] == max_word[:i]:
            return i/min_len
    return 0.0


def dist_prefix(src, tar):
    """Return the prefix distance between two strings.

    Prefix distance is the complement of prefix similarity:
    :math:`dist_{prefix} = 1 - sim_{prefix}`.

    :param str src, tar: two strings to be compared
    :returns: prefix distance
    :rtype: float

    >>> dist_prefix('cat', 'hat')
    1.0
    >>> dist_prefix('Niall', 'Neil')
    0.75
    >>> dist_prefix('aluminum', 'Catalan')
    1.0
    >>> dist_prefix('ATCG', 'TAGC')
    1.0
    """
    return 1 - sim_prefix(src, tar)


def sim_suffix(src, tar):
    """Return the suffix similarity of two strings.

    Suffix similarity is the ratio of the length of the shorter term that
    exactly matches the longer term to the length of the shorter term,
    beginning at the end of both terms.

    :param str src, tar: two strings to be compared
    :returns: suffix similarity
    :rtype: float

    >>> sim_suffix('cat', 'hat')
    0.6666666666666666
    >>> sim_suffix('Niall', 'Neil')
    0.25
    >>> sim_suffix('aluminum', 'Catalan')
    0.0
    >>> sim_suffix('ATCG', 'TAGC')
    0.0
    """
    if src == tar:
        return 1.0
    if not src or not tar:
        return 0.0
    min_word, max_word = (src, tar) if len(src) < len(tar) else (tar, src)
    min_len = len(min_word)
    for i in range(min_len, 0, -1):
        if min_word[-i:] == max_word[-i:]:
            return i/min_len
    return 0.0


def dist_suffix(src, tar):
    """Return the suffix distance between two strings.

    Suffix distance is the complement of suffix similarity:
    :math:`dist_{suffix} = 1 - sim_{suffix}`.

    :param str src, tar: two strings to be compared
    :returns: suffix distance
    :rtype: float

    >>> dist_suffix('cat', 'hat')
    0.33333333333333337
    >>> dist_suffix('Niall', 'Neil')
    0.75
    >>> dist_suffix('aluminum', 'Catalan')
    1.0
    >>> dist_suffix('ATCG', 'TAGC')
    1.0
    """
    return 1 - sim_suffix(src, tar)


def sim_mlipns(src, tar, threshold=0.25, maxmismatches=2):
    """Return the MLIPNS similarity of two strings.

    Modified Language-Independent Product Name Search (MLIPNS) is described in
    :cite:`Shannaq:2010`. This function returns only 1.0 (similar) or 0.0
    (not similar). LIPNS similarity is identical to normalized Hamming
    similarity.

    :param str src, tar: two strings to be compared
    :param float threshold: a number [0, 1] indicating the maximum similarity
        score, below which the strings are considered 'similar' (0.25 by
        default)
    :param int maxmismatches: a number indicating the allowable number of
        mismatches to remove before declaring two strings not similar (2 by
        default)
    :returns: MLIPNS similarity
    :rtype: float

    >>> sim_mlipns('cat', 'hat')
    1.0
    >>> sim_mlipns('Niall', 'Neil')
    0.0
    >>> sim_mlipns('aluminum', 'Catalan')
    0.0
    >>> sim_mlipns('ATCG', 'TAGC')
    0.0
    """
    if tar == src:
        return 1.0
    if not src or not tar:
        return 0.0

    mismatches = 0
    ham = hamming(src, tar, difflens=True)
    maxlen = max(len(src), len(tar))
    while src and tar and mismatches <= maxmismatches:
        if maxlen < 1 or (1-(maxlen-ham)/maxlen) <= threshold:
            return 1.0
        else:
            mismatches += 1
            ham -= 1
            maxlen -= 1

    if maxlen < 1:
        return 1.0
    return 0.0


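# A worked illustration (not from the original source): for 'cat' vs.
# 'hat', the Hamming distance is 1 over a maximum length of 3, so the
# mismatch share is 1 - (3-1)/3 ~ 0.33, above the 0.25 threshold. One
# mismatch is then removed (ham=0, maxlen=2), giving 1 - (2-0)/2 = 0.0,
# which falls below the threshold, so 1.0 (similar) is returned, as in the
# doctest above.

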
def dist_mlipns(src, tar, threshold=0.25, maxmismatches=2):
    """Return the MLIPNS distance between two strings.

    MLIPNS distance is the complement of MLIPNS similarity:
    :math:`dist_{MLIPNS} = 1 - sim_{MLIPNS}`. This function returns only 0.0
    (distant) or 1.0 (not distant).

    :param str src, tar: two strings to be compared
    :param float threshold: a number [0, 1] indicating the maximum similarity
        score, below which the strings are considered 'similar' (0.25 by
        default)
    :param int maxmismatches: a number indicating the allowable number of
        mismatches to remove before declaring two strings not similar (2 by
        default)
    :returns: MLIPNS distance
    :rtype: float

    >>> dist_mlipns('cat', 'hat')
    0.0
    >>> dist_mlipns('Niall', 'Neil')
    1.0
    >>> dist_mlipns('aluminum', 'Catalan')
    1.0
    >>> dist_mlipns('ATCG', 'TAGC')
    1.0
    """
    return 1.0 - sim_mlipns(src, tar, threshold, maxmismatches)


def bag(src, tar):
    """Return the bag distance between two strings.

    Bag distance is proposed in :cite:`Bartolini:2002`. It is defined as:
    :math:`max(|multiset(src)-multiset(tar)|, |multiset(tar)-multiset(src)|)`.

    :param str src, tar: two strings to be compared
    :returns: bag distance
    :rtype: int

    >>> bag('cat', 'hat')
    1
    >>> bag('Niall', 'Neil')
    2
    >>> bag('aluminum', 'Catalan')
    5
    >>> bag('ATCG', 'TAGC')
    0
    >>> bag('abcdefg', 'hijklm')
    7
    >>> bag('abcdefg', 'hijklmno')
    8
    """
    if tar == src:
        return 0
    elif not src:
        return len(tar)
    elif not tar:
        return len(src)

    src_bag = Counter(src)
    tar_bag = Counter(tar)
    return max(sum((src_bag-tar_bag).values()),
               sum((tar_bag-src_bag).values()))


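# A worked illustration (not from the original source): for 'Niall' vs.
# 'Neil', multiset(src) - multiset(tar) leaves {'a': 1, 'l': 1} (size 2)
# and multiset(tar) - multiset(src) leaves {'e': 1} (size 1), so the bag
# distance is max(2, 1) = 2, as in the doctest above.

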
def dist_bag(src, tar):
    """Return the normalized bag distance between two strings.

    Bag distance is normalized by dividing by :math:`max( |src|, |tar| )`.

    :param str src, tar: two strings to be compared
    :returns: normalized bag distance
    :rtype: float

    >>> dist_bag('cat', 'hat')
    0.3333333333333333
    >>> dist_bag('Niall', 'Neil')
    0.4
    >>> dist_bag('aluminum', 'Catalan')
    0.625
    >>> dist_bag('ATCG', 'TAGC')
    0.0
    """
    if tar == src:
        return 0.0
    if not src or not tar:
        return 1.0

    maxlen = max(len(src), len(tar))

    return bag(src, tar)/maxlen


def sim_bag(src, tar):
    """Return the normalized bag similarity of two strings.

    Normalized bag similarity is the complement of normalized bag distance:
    :math:`sim_{bag} = 1 - dist_{bag}`.

    :param str src, tar: two strings to be compared
    :returns: normalized bag similarity
    :rtype: float

    >>> round(sim_bag('cat', 'hat'), 12)
    0.666666666667
    >>> sim_bag('Niall', 'Neil')
    0.6
    >>> sim_bag('aluminum', 'Catalan')
    0.375
    >>> sim_bag('ATCG', 'TAGC')
    1.0
    """
    return 1-dist_bag(src, tar)


def editex(src, tar, cost=(0, 1, 2), local=False):
    """Return the Editex distance between two strings.

    As described on pages 3 & 4 of :cite:`Zobel:1996`.

    The local variant is based on :cite:`Ring:2009`.

    :param str src, tar: two strings to be compared
    :param tuple cost: a 3-tuple representing the cost of the three types of
        edits: match, same-group, and mismatch, respectively
        (by default: (0, 1, 2))
    :param bool local: if True, the local variant of Editex is used
    :returns: Editex distance
    :rtype: int

    >>> editex('cat', 'hat')
    2
    >>> editex('Niall', 'Neil')
    2
    >>> editex('aluminum', 'Catalan')
    12
    >>> editex('ATCG', 'TAGC')
    6
    """
    match_cost, group_cost, mismatch_cost = cost
    letter_groups = ({'A', 'E', 'I', 'O', 'U', 'Y'},
                     {'B', 'P'},
                     {'C', 'K', 'Q'},
                     {'D', 'T'},
                     {'L', 'R'},
                     {'M', 'N'},
                     {'G', 'J'},
                     {'F', 'P', 'V'},
                     {'S', 'X', 'Z'},
                     {'C', 'S', 'Z'})
    all_letters = {'A', 'B', 'C', 'D', 'E', 'F', 'G', 'I', 'J', 'K', 'L', 'M',
                   'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'X', 'Y', 'Z'}

    def r_cost(ch1, ch2):
        """Return r(a,b) according to Zobel & Dart's definition."""
        if ch1 == ch2:
            return match_cost
        if ch1 in all_letters and ch2 in all_letters:
            for group in letter_groups:
                if ch1 in group and ch2 in group:
                    return group_cost
        return mismatch_cost

    def d_cost(ch1, ch2):
        """Return d(a,b) according to Zobel & Dart's definition."""
        if ch1 != ch2 and ch1 in {'H', 'W'}:
            return group_cost
        return r_cost(ch1, ch2)

    # convert both src & tar to NFKD normalized unicode
    src = normalize('NFKD', text_type(src.upper()))
    tar = normalize('NFKD', text_type(tar.upper()))
    # convert ß to SS (for Python2)
    src = src.replace('ß', 'SS')
    tar = tar.replace('ß', 'SS')

    if src == tar:
        return 0
    if not src:
        return len(tar) * mismatch_cost
    if not tar:
        return len(src) * mismatch_cost

    # pylint: disable=no-member
    d_mat = np_zeros((len(src)+1, len(tar)+1), dtype=np_int)
    # pylint: enable=no-member
    lens = len(src)
    lent = len(tar)
    src = ' '+src
    tar = ' '+tar

    if not local:
        for i in range(1, lens+1):
            d_mat[i, 0] = d_mat[i-1, 0] + d_cost(src[i-1], src[i])
    for j in range(1, lent+1):
        d_mat[0, j] = d_mat[0, j-1] + d_cost(tar[j-1], tar[j])

    for i in range(1, lens+1):
        for j in range(1, lent+1):
            d_mat[i, j] = min(d_mat[i-1, j] + d_cost(src[i-1], src[i]),
                              d_mat[i, j-1] + d_cost(tar[j-1], tar[j]),
                              d_mat[i-1, j-1] + r_cost(src[i], tar[j]))

    return d_mat[lens, lent]


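# A worked illustration (not from the original source): 'cat' vs. 'hat'
# differ only in their first letters, and 'C' and 'H' share no letter
# group, so the substitution costs the full mismatch price of 2, giving
# editex('cat', 'hat') == 2, as in the doctest above. With the same-group
# pair 'D'/'T' (as in 'dip' vs. 'tip'), the substitution would cost only
# the group price of 1.

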
def dist_editex(src, tar, cost=(0, 1, 2), local=False):
    """Return the normalized Editex distance between two strings.

    The Editex distance is normalized by dividing the Editex distance by the
    greater of the number of characters in src times the cost of a delete and
    the number of characters in tar times the cost of an insert.
    For the case in which all operations have :math:`cost = 1`, this is
    equivalent to the greater of the length of the two strings src & tar.

    :param str src, tar: two strings to be compared
    :param tuple cost: a 3-tuple representing the cost of the three types of
        edits: match, same-group, and mismatch, respectively
        (by default: (0, 1, 2))
    :param bool local: if True, the local variant of Editex is used
    :returns: normalized Editex distance
    :rtype: float

    >>> round(dist_editex('cat', 'hat'), 12)
    0.333333333333
    >>> round(dist_editex('Niall', 'Neil'), 12)
    0.2
    >>> dist_editex('aluminum', 'Catalan')
    0.75
    >>> dist_editex('ATCG', 'TAGC')
    0.75
    """
    if src == tar:
        return 0
    mismatch_cost = cost[2]
    return (editex(src, tar, cost, local) /
            (max(len(src)*mismatch_cost, len(tar)*mismatch_cost)))


def sim_editex(src, tar, cost=(0, 1, 2), local=False):
    """Return the normalized Editex similarity of two strings.

    The Editex similarity is the complement of Editex distance:
    :math:`sim_{Editex} = 1 - dist_{Editex}`.

    :param str src, tar: two strings to be compared
    :param tuple cost: a 3-tuple representing the cost of a match,
        a same-group match, and a mismatch, respectively
        (by default: (0, 1, 2))
    :param bool local: if True, the local variant of Editex is used
    :returns: normalized Editex similarity
    :rtype: float

    >>> round(sim_editex('cat', 'hat'), 12)
    0.666666666667
    >>> round(sim_editex('Niall', 'Neil'), 12)
    0.8
    >>> sim_editex('aluminum', 'Catalan')
    0.25
    >>> sim_editex('ATCG', 'TAGC')
    0.25
    """
    return 1 - dist_editex(src, tar, cost, local)


def eudex_hamming(src, tar, weights='exponential', maxlength=8,
                  normalized=False):
    """Calculate the Hamming distance between the Eudex hashes of two terms.

    Cf. :cite:`Ticki:2016`.

    - If weights is set to None, a simple Hamming distance is calculated.
    - If weights is set to 'exponential', weight decays by powers of 2, as
      proposed in the eudex specification: https://github.com/ticki/eudex.
    - If weights is set to 'fibonacci', weight decays through the Fibonacci
      series, as in the eudex reference implementation.
    - If weights is set to a callable function, this assumes it creates a
      generator and the generator is used to populate a series of weights.
    - If weights is set to an iterable, the iterable's values should be
      integers and will be used as the weights.

    :param str src, tar: two strings to be compared
    :param iterable or generator function weights: the weights to apply to
        each byte of the hash, as described above
    :param maxlength: the number of characters to encode as a eudex hash
    :param bool normalized: if True, normalize the result to [0, 1]
    :return: the (optionally weighted) Hamming distance between the two
        Eudex hashes
    """
    def _gen_fibonacci():
        """Yield the next Fibonacci number.

        Based on https://www.python-course.eu/generators.php
        Starts at Fibonacci number 3 (the second 1)
        """
        num_a, num_b = 1, 2
        while True:
            yield num_a
            num_a, num_b = num_b, num_a + num_b

    def _gen_exponential(base=2):
        """Yield the next value in an exponential series of the base.

        Starts at base**0
        """
        exp = 0
        while True:
            yield base ** exp
            exp += 1

    # Calculate the eudex hashes and XOR them
    xored = eudex(src, maxlength=maxlength) ^ eudex(tar, maxlength=maxlength)

    # Simple hamming distance (all bits are equal)
    if not weights:
        return bin(xored).count('1')

    # If weights is a function, it should create a generator,
    # which we now use to populate a list
    if callable(weights):
        weights = weights()
    elif weights == 'exponential':
        weights = _gen_exponential()
    elif weights == 'fibonacci':
        weights = _gen_fibonacci()
    if isinstance(weights, GeneratorType):
        weights = [next(weights) for _ in range(maxlength)][::-1]

    # Sum the weighted hamming distance, one byte of the hash at a time
    hdist = 0
    maxdist = 0
    while (xored or normalized) and weights:
        maxdist += 8*weights[-1]
        hdist += bin(xored & 0xFF).count('1') * weights.pop()
        xored >>= 8

    if normalized:
        hdist /= maxdist

    return hdist


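# Illustrative usage (an addition, not part of the original module): any
# callable that returns a generator of integer weights can be supplied, e.g.
# a geometric series halving from 128:
#
#     def _halving():
#         weight = 128
#         while True:
#             yield weight
#             weight //= 2
#
#     eudex_hamming('Niall', 'Neil', weights=_halving)

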
def dist_eudex(src, tar, weights='exponential', maxlength=8):
    """Return normalized Hamming distance between Eudex hashes of two terms.

    This is Eudex distance normalized to [0, 1].

    :param str src, tar: two strings to be compared
    :param iterable or generator function weights: the weights to apply to
        each byte of the hash (cf. eudex_hamming)
    :param maxlength: the number of characters to encode as a eudex hash
    :return: normalized Eudex distance
    """
    return eudex_hamming(src, tar, weights, maxlength, True)


def sim_eudex(src, tar, weights='exponential', maxlength=8):
    """Return normalized Hamming similarity between Eudex hashes of two terms.

    Normalized Eudex similarity is the complement of normalized Eudex
    distance: :math:`sim_{Eudex} = 1 - dist_{Eudex}`.

    :param str src, tar: two strings to be compared
    :param iterable or generator function weights: the weights to apply to
        each byte of the hash (cf. eudex_hamming)
    :param maxlength: the number of characters to encode as a eudex hash
    :return: normalized Eudex similarity
    """
    return 1 - dist_eudex(src, tar, weights, maxlength)


def sift4_simplest(src, tar, max_offset=0):
    """Return the "simplest" Sift4 distance between two terms.

    This is an approximation of edit distance, described in
    :cite:`Zackwehdex:2014`.

    :param str src, tar: two strings to be compared
    :param max_offset: the number of characters to search for matching letters
    :return: the Sift4 distance according to the simplest formula
    """
    if not src:
        return len(tar)

    if not tar:
        return len(src)

    src_len = len(src)
    tar_len = len(tar)

    src_cur = 0
    tar_cur = 0
    lcss = 0
    local_cs = 0

    while (src_cur < src_len) and (tar_cur < tar_len):
        if src[src_cur] == tar[tar_cur]:
            local_cs += 1
        else:
            lcss += local_cs
            local_cs = 0
            if src_cur != tar_cur:
                src_cur = tar_cur = max(src_cur, tar_cur)
            for i in range(max_offset):
                if not ((src_cur+i < src_len) or (tar_cur+i < tar_len)):
                    break
                if (src_cur+i < src_len) and (src[src_cur+i] == tar[tar_cur]):
                    src_cur += i
                    local_cs += 1
                    break
                if (tar_cur+i < tar_len) and (src[src_cur] == tar[tar_cur+i]):
                    tar_cur += i
                    local_cs += 1
                    break

        src_cur += 1
        tar_cur += 1

    lcss += local_cs
    return round(max(src_len, tar_len) - lcss)


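# A worked example (an illustrative addition): with the default max_offset=0,
# only characters at equal positions can match, so 'cat' vs. 'hat' keeps the
# common substring 'at' (lcss=2) and scores max(3, 3) - 2:
#
#     sift4_simplest('cat', 'hat')     # -> 1
#     sift4_simplest('Niall', 'Neil')  # -> 3

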
def sift4_common(src, tar, max_offset=0, max_distance=0):
    """Return the "common" Sift4 distance between two terms.

    This is an approximation of edit distance, described in
    :cite:`Zackwehdex:2014`.

    :param str src, tar: two strings to be compared
    :param max_offset: the number of characters to search for matching letters
    :param max_distance: the distance at which to stop and exit
    :return: the Sift4 distance according to the common formula
    """
    if not src:
        return len(tar)

    if not tar:
        return len(src)

    src_len = len(src)
    tar_len = len(tar)

    src_cur = 0
    tar_cur = 0
    lcss = 0
    local_cs = 0
    trans = 0
    offset_arr = []

    while (src_cur < src_len) and (tar_cur < tar_len):
        if src[src_cur] == tar[tar_cur]:
            local_cs += 1
            is_trans = False
            i = 0
            while i < len(offset_arr):
                ofs = offset_arr[i]
                if src_cur <= ofs['src_cur'] or tar_cur <= ofs['tar_cur']:
                    is_trans = (abs(tar_cur-src_cur) >=
                                abs(ofs['tar_cur']-ofs['src_cur']))
                    if is_trans:
                        trans += 1
                    elif not ofs['trans']:
                        ofs['trans'] = True
                        trans += 1
                    break
                elif src_cur > ofs['tar_cur'] and tar_cur > ofs['src_cur']:
                    del offset_arr[i]
                else:
                    i += 1

            offset_arr.append({'src_cur': src_cur, 'tar_cur': tar_cur,
                               'trans': is_trans})
        else:
            lcss += local_cs
            local_cs = 0
            if src_cur != tar_cur:
                src_cur = tar_cur = min(src_cur, tar_cur)
            for i in range(max_offset):
                if not ((src_cur+i < src_len) or (tar_cur+i < tar_len)):
                    break
                if (src_cur+i < src_len) and (src[src_cur+i] == tar[tar_cur]):
                    src_cur += i-1
                    tar_cur -= 1
                    break
                if (tar_cur+i < tar_len) and (src[src_cur] == tar[tar_cur+i]):
                    src_cur -= 1
                    tar_cur += i-1
                    break

        src_cur += 1
        tar_cur += 1

        if max_distance:
            temporary_distance = max(src_cur, tar_cur) - lcss + trans
            if temporary_distance >= max_distance:
                return round(temporary_distance)

        if (src_cur >= src_len) or (tar_cur >= tar_len):
            lcss += local_cs
            local_cs = 0
            src_cur = tar_cur = min(src_cur, tar_cur)

    lcss += local_cs
    return round(max(src_len, tar_len) - lcss + trans)


def dist_sift4(src, tar, max_offset=0, max_distance=0):
    """Return the normalized "common" Sift4 distance between two terms.

    This is Sift4 distance, normalized to [0, 1].

    :param str src, tar: two strings to be compared
    :param max_offset: the number of characters to search for matching letters
    :param max_distance: the distance at which to stop and exit
    :return: normalized Sift4 distance
    """
    return (sift4_common(src, tar, max_offset, max_distance) /
            (max(len(src), len(tar))))


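# Illustrative addition: normalization divides by the longer string's length,
# so the 'cat'/'hat' example above becomes 1/3:
#
#     round(dist_sift4('cat', 'hat'), 12)    # -> 0.333333333333

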
def sim_sift4(src, tar, max_offset=0, max_distance=0):
    """Return the normalized "common" Sift4 similarity of two terms.

    Normalized Sift4 similarity is the complement of normalized Sift4
    distance: :math:`sim_{Sift4} = 1 - dist_{Sift4}`.

    :param str src, tar: two strings to be compared
    :param max_offset: the number of characters to search for matching letters
    :param max_distance: the distance at which to stop and exit
    :return: normalized Sift4 similarity
    """
    return 1 - dist_sift4(src, tar, max_offset, max_distance)


def sim_baystat(src, tar, min_ss_len=None, left_ext=None, right_ext=None):
    """Return the Baystat similarity.

    Good results for shorter words are reported when setting min_ss_len to 1
    and either left_ext OR right_ext to 1.

    The Baystat similarity is defined in :cite:`Furnohr:2002`.

    This is ostensibly a port of the R module PPRL's implementation:
    https://github.com/cran/PPRL/blob/master/src/MTB_Baystat.cpp
    :cite:`Rukasz:2018`. As such, this could be made more pythonic.

    :param str src, tar: two strings to be compared
    :param int min_ss_len: minimum substring length to be considered
    :param int left_ext: left-side extension length
    :param int right_ext: right-side extension length
    :rtype: float
    :return: the Baystat similarity
    """
    if src == tar:
        return 1
    if not src or not tar:
        return 0

    max_len = max(len(src), len(tar))

    if not (min_ss_len and left_ext and right_ext):
        # These can be set via arguments to the function. Otherwise they are
        # set automatically based on values from the article.
        if max_len >= 7:
            min_ss_len = 2
            left_ext = 2
            right_ext = 2
        else:
            # The paper suggests that for short names, (exclusively) one or
            # the other of left_ext and right_ext can be 1, with good results.
            # I use 0 & 0 as the default in this case.
            min_ss_len = 1
            left_ext = 0
            right_ext = 0

    pos = 0
    match_len = 0

    while True:
        if pos + min_ss_len > len(src):
            return match_len/max_len

        hit_len = 0
        idx = 1

        substring = src[pos:pos + min_ss_len]
        search_begin = pos - left_ext

        if search_begin < 0:
            search_begin = 0
            left_ext_len = pos
        else:
            left_ext_len = left_ext

        if pos + min_ss_len + right_ext >= len(tar):
            right_ext_len = len(tar) - pos - min_ss_len
        else:
            right_ext_len = right_ext

        if (search_begin + left_ext_len + min_ss_len + right_ext_len >
                search_begin):
            search_val = tar[search_begin:(search_begin + left_ext_len +
                                           min_ss_len + right_ext_len)]
        else:
            search_val = ''

        flagged_tar = ''
        while substring in search_val and pos + idx <= len(src):
            hit_len = len(substring)
            flagged_tar = tar.replace(substring, '#'*hit_len)

            if pos + min_ss_len + idx <= len(src):
                substring = src[pos:pos + min_ss_len + idx]

            if pos+min_ss_len + right_ext_len + 1 <= len(tar):
                right_ext_len += 1

            if (search_begin + left_ext_len + min_ss_len + right_ext_len <=
                    len(tar)):
                search_val = tar[search_begin:(search_begin + left_ext_len +
                                               min_ss_len + right_ext_len)]

            idx += 1

        if hit_len > 0:
            tar = flagged_tar

        match_len += hit_len
        pos += idx


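# A worked example (an illustrative addition): for strings shorter than 7
# characters the defaults fall back to min_ss_len=1 with no extensions, so
# 'cat' vs. 'hat' matches the substring 'at', giving 2/3:
#
#     round(sim_baystat('cat', 'hat'), 12)    # -> 0.666666666667

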
def dist_baystat(src, tar, min_ss_len=None, left_ext=None, right_ext=None):
    """Return the Baystat distance.

    Normalized Baystat similarity is the complement of normalized Baystat
    distance: :math:`sim_{Baystat} = 1 - dist_{Baystat}`.

    :param str src, tar: two strings to be compared
    :param int min_ss_len: minimum substring length to be considered
    :param int left_ext: left-side extension length
    :param int right_ext: right-side extension length
    :rtype: float
    :return: the Baystat distance
    """
    return 1 - sim_baystat(src, tar, min_ss_len, left_ext, right_ext)


def typo(src, tar, metric='euclidean', cost=(1, 1, 0.5, 0.5), layout='QWERTY'):
    """Return the typo distance between two strings.

    This is inspired by Typo-Distance :cite:`Song:2011`, and a fair bit of
    this was copied from that module. Compared to the original, this supports
    different metrics for substitution.

    :param str src, tar: two strings to be compared
    :param str metric: supported values include: 'euclidean', 'manhattan',
          'log-euclidean', and 'log-manhattan'
    :param tuple cost: a 4-tuple representing the cost of the four possible
        edits: inserts, deletes, substitutions, and shift, respectively (by
        default: (1, 1, 0.5, 0.5)) The substitution & shift costs should be
        significantly less than the cost of an insertion & deletion unless
        a log metric is used.
    :return: typo distance
    :rtype: float
    """
    ins_cost, del_cost, sub_cost, shift_cost = cost

    if src == tar:
        return 0.0
    if not src:
        return len(tar) * ins_cost
    if not tar:
        return len(src) * del_cost

    kbs = {'QWERTY': (
        (('`', '1', '2', '3', '4', '5', '6', '7', '8', '9', '0', '-', '='),
         ('', 'q', 'w', 'e', 'r', 't', 'y', 'u', 'i', 'o', 'p', '[', ']',
          '\\'),
         ('', 'a', 's', 'd', 'f', 'g', 'h', 'j', 'k', 'l', ';', '\''),
         ('', 'z', 'x', 'c', 'v', 'b', 'n', 'm', ',', '.', '/')),
        (('~', '!', '@', '#', '$', '%', '^', '&', '*', '(', ')', '_', '+'),
         ('', 'Q', 'W', 'E', 'R', 'T', 'Y', 'U', 'I', 'O', 'P', '{', '}', '|'),
         ('', 'A', 'S', 'D', 'F', 'G', 'H', 'J', 'K', 'L', ':', '"'),
         ('', 'Z', 'X', 'C', 'V', 'B', 'N', 'M', '<', '>', '?'))
    ), 'Dvorak': (
        (('`', '1', '2', '3', '4', '5', '6', '7', '8', '9', '0', '[', ']'),
         ('', '\'', ',', '.', 'p', 'y', 'f', 'g', 'c', 'r', 'l', '/', '=',
          '\\'),
         ('', 'a', 'o', 'e', 'u', 'i', 'd', 'h', 't', 'n', 's', '-'),
         ('', ';', 'q', 'j', 'k', 'x', 'b', 'm', 'w', 'v', 'z')),
        (('~', '!', '@', '#', '$', '%', '^', '&', '*', '(', ')', '{', '}'),
         ('', '"', '<', '>', 'P', 'Y', 'F', 'G', 'C', 'R', 'L', '?', '+', '|'),
         ('', 'A', 'O', 'E', 'U', 'I', 'D', 'H', 'T', 'N', 'S', '_'),
         ('', ':', 'Q', 'J', 'K', 'X', 'B', 'M', 'W', 'V', 'Z'))
    ), 'AZERTY': (
        (('²', '&', 'é', '"', '\'', '(', '-', 'è', '_', 'ç', 'à', ')', '='),
         ('', 'a', 'z', 'e', 'r', 't', 'y', 'u', 'i', 'o', 'p', '', '$'),
         ('', 'q', 's', 'd', 'f', 'g', 'h', 'j', 'k', 'l', 'm', 'ù', '*'),
         ('<', 'w', 'x', 'c', 'v', 'b', 'n', ',', ';', ':', '!')),
        (('~', '1', '2', '3', '4', '5', '6', '7', '8', '9', '0', '°', '+'),
         ('', 'A', 'W', 'E', 'R', 'T', 'Y', 'U', 'I', 'O', 'P', '', '£'),
         ('', 'Q', 'S', 'D', 'F', 'G', 'H', 'J', 'K', 'L', 'M', 'Ù', 'μ'),
         ('>', 'W', 'X', 'C', 'V', 'B', 'N', '?', '.', '/', '§'))
    ), 'QWERTZ': (
        (('', '1', '2', '3', '4', '5', '6', '7', '8', '9', '0', 'ß', ''),
         ('', 'q', 'w', 'e', 'r', 't', 'z', 'u', 'i', 'o', 'p', 'ü', '+',
          '\\'),
         ('', 'a', 's', 'd', 'f', 'g', 'h', 'j', 'k', 'l', 'ö', 'ä', '#'),
         ('<', 'y', 'x', 'c', 'v', 'b', 'n', 'm', ',', '.', '-')),
        (('°', '!', '"', '§', '$', '%', '&', '/', '(', ')', '=', '?', ''),
         ('', 'Q', 'W', 'E', 'R', 'T', 'Z', 'U', 'I', 'O', 'P', 'Ü', '*', ''),
         ('', 'A', 'S', 'D', 'F', 'G', 'H', 'J', 'K', 'L', 'Ö', 'Ä', '\''),
         ('>', 'Y', 'X', 'C', 'V', 'B', 'N', 'M', ';', ':', '_'))
    )}

    keyboard = kbs[layout]
    lowercase = {item for sublist in keyboard[0] for item in sublist}
    uppercase = {item for sublist in keyboard[1] for item in sublist}

    def _kb_array_for_char(char):
        """Return the keyboard layout that contains char."""
        if char in lowercase:
            return keyboard[0]
        if char in uppercase:
            return keyboard[1]
        raise ValueError(char + ' not found in any keyboard layouts')

    def _get_char_coord(char, kb_array):
        """Return the row & column of char in the keyboard array."""
        for row in kb_array:
            if char in row:
                return kb_array.index(row), row.index(char)
        raise ValueError(char + ' not found in given keyboard layout')

    def _euclidean_keyboard_distance(char1, char2):
        row1, col1 = _get_char_coord(char1, _kb_array_for_char(char1))
        row2, col2 = _get_char_coord(char2, _kb_array_for_char(char2))
        return ((row1 - row2) ** 2 + (col1 - col2) ** 2) ** 0.5

    def _manhattan_keyboard_distance(char1, char2):
        row1, col1 = _get_char_coord(char1, _kb_array_for_char(char1))
        row2, col2 = _get_char_coord(char2, _kb_array_for_char(char2))
        return abs(row1 - row2) + abs(col1 - col2)

    def _log_euclidean_keyboard_distance(char1, char2):
        return log(1 + _euclidean_keyboard_distance(char1, char2))

    def _log_manhattan_keyboard_distance(char1, char2):
        return log(1 + _manhattan_keyboard_distance(char1, char2))

    metric_dict = {'euclidean': _euclidean_keyboard_distance,
                   'manhattan': _manhattan_keyboard_distance,
                   'log-euclidean': _log_euclidean_keyboard_distance,
                   'log-manhattan': _log_manhattan_keyboard_distance}

    def substitution_cost(char1, char2):
        cost = sub_cost
        cost *= (metric_dict[metric](char1, char2) +
                 shift_cost * (_kb_array_for_char(char1) !=
                               _kb_array_for_char(char2)))
        return cost

    d_mat = np_zeros((len(src) + 1, len(tar) + 1), dtype=np_float32)
    for i in range(len(src) + 1):
        d_mat[i, 0] = i * del_cost
    for j in range(len(tar) + 1):
        d_mat[0, j] = j * ins_cost

    for i, src_char in enumerate(src):
        for j, tar_char in enumerate(tar):
            d_mat[i + 1, j + 1] = min(
                d_mat[i + 1, j] + ins_cost,  # ins
                d_mat[i, j + 1] + del_cost,  # del
                d_mat[i, j] + (substitution_cost(src_char, tar_char)
                               if src_char != tar_char else 0)  # sub/==
            )

    return d_mat[len(src), len(tar)]


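# A worked example (an illustrative addition): on QWERTY, 'c' sits at row 3,
# col 3 and 'h' at row 2, col 6, so the Euclidean substitution cost is
# 0.5 * sqrt(1 + 9) ~= 1.581, cheaper than a delete plus an insert (2.0):
#
#     round(float(typo('cat', 'hat')), 4)    # -> 1.5811

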
def dist_typo(src, tar, metric='euclidean', cost=(1, 1, 0.5, 0.5)):
    """Return the normalized typo distance between two strings.

    This is typo distance, normalized to [0, 1].

    :param str src, tar: two strings to be compared
    :param str metric: supported values include: 'euclidean', 'manhattan',
          'log-euclidean', and 'log-manhattan'
    :param tuple cost: a 4-tuple representing the cost of the four possible
        edits: inserts, deletes, substitutions, and shift, respectively (by
        default: (1, 1, 0.5, 0.5)) The substitution & shift costs should be
        significantly less than the cost of an insertion & deletion unless
        a log metric is used.
    :return: normalized typo distance
    :rtype: float
    """
    if src == tar:
        return 0
    ins_cost, del_cost = cost[:2]
    return (typo(src, tar, metric, cost) /
            (max(len(src)*del_cost, len(tar)*ins_cost)))


def sim_typo(src, tar, metric='euclidean', cost=(1, 1, 0.5, 0.5)):
    """Return the normalized typo similarity between two strings.

    Normalized typo similarity is the complement of normalized typo distance:
    :math:`sim_{typo} = 1 - dist_{typo}`.

    :param str src, tar: two strings to be compared
    :param str metric: supported values include: 'euclidean', 'manhattan',
          'log-euclidean', and 'log-manhattan'
    :param tuple cost: a 4-tuple representing the cost of the four possible
        edits: inserts, deletes, substitutions, and shift, respectively (by
        default: (1, 1, 0.5, 0.5)) The substitution & shift costs should be
        significantly less than the cost of an insertion & deletion unless
        a log metric is used.
    :return: normalized typo similarity
    :rtype: float
    """
    return 1 - dist_typo(src, tar, metric, cost)


def dist_indel(src, tar):
    """Return the indel distance between two strings.

    This is equivalent to Levenshtein distance when only inserts and deletes
    are allowed.

    :param str src, tar: two strings to be compared
    :return: indel distance
    :rtype: float
    """
    return dist_levenshtein(src, tar, mode='lev', cost=(1, 1, 9999, 9999))


def sim_indel(src, tar):
    """Return the indel similarity of two strings.

    Normalized indel similarity is the complement of normalized indel
    distance: :math:`sim_{indel} = 1 - dist_{indel}`

    :param str src, tar: two strings to be compared
    :return: indel similarity
    :rtype: float
    """
    return sim_levenshtein(src, tar, mode='lev', cost=(1, 1, 9999, 9999))


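# Illustrative addition: with substitutions and transpositions priced out at
# 9999, 'cat' -> 'hat' requires one delete and one insert; assuming
# dist_levenshtein normalizes by the longer string's length (as the other
# dist_ functions in this module do), that yields 2/3:
#
#     round(dist_indel('cat', 'hat'), 12)    # -> 0.666666666667

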
def _synoname_strip_punct(word):
    """Return a word with punctuation stripped out.

    :param word: a word to strip punctuation from
    :return: the word stripped of punctuation and trimmed of whitespace
    """
    stripped = ''
    for char in word:
        if char not in set(',-./:;"&\'()!{|}?$%*+<=>[\\]^_`~'):
            stripped += char
    return stripped.strip()


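# Illustrative addition: punctuation (but not whitespace between words) is
# removed, and the result is trimmed:
#
#     _synoname_strip_punct("d'Artagnan-B.")    # -> 'dArtagnanB'

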
def synoname_word_approximation(src_ln, tar_ln, src_fn='', tar_fn='',
                                features=None):
    """Return the Synoname word approximation score for two names.

    :param str src_ln, tar_ln: last names of the source and target
    :param str src_fn, tar_fn: first names of the source and target (optional)
    :param features: a dict containing special features calculated via
        fingerprint.synoname_toolcode() (optional)
    :returns: The word approximation score
    :rtype: float
    """
    if features is None:
        features = {}
    if 'src_specials' not in features:
        features['src_specials'] = []
    if 'tar_specials' not in features:
        features['tar_specials'] = []

    src_len_specials = len(features['src_specials'])
    tar_len_specials = len(features['tar_specials'])

    # 1
    if ('gen_conflict' not in features or features['gen_conflict'] or
            'roman_conflict' not in features or features['roman_conflict']):
        return 0

    # 3 & 7
    full_tar1 = ' '.join((tar_ln, tar_fn)).replace('-', ' ')
    for s_type, s_pos in features['tar_specials']:
        if s_pos == 'a':
            full_tar1 = full_tar1[:1+len(_synoname_special_table[s_type][1])]
        elif s_pos == 'b':
            loc = full_tar1.find(' '+_synoname_special_table[s_type][1]+' ')+1
            full_tar1 = (full_tar1[:loc] +
                         full_tar1[loc +
                                   len(_synoname_special_table[s_type][1]):])
        elif s_pos == 'c':
            full_tar1 = full_tar1[1+len(_synoname_special_table[s_type][1]):]

    full_src1 = ' '.join((src_ln, src_fn)).replace('-', ' ')
    for s_type, s_pos in features['src_specials']:
        if s_pos == 'a':
            full_src1 = full_src1[:1+len(_synoname_special_table[s_type][1])]
        elif s_pos == 'b':
            loc = full_src1.find(' '+_synoname_special_table[s_type][1]+' ')+1
            full_src1 = (full_src1[:loc] +
                         full_src1[loc +
                                   len(_synoname_special_table[s_type][1]):])
        elif s_pos == 'c':
            full_src1 = full_src1[1+len(_synoname_special_table[s_type][1]):]

    full_tar2 = full_tar1
    for s_type, s_pos in features['tar_specials']:
        if s_pos == 'd':
            full_tar2 = full_tar2[len(_synoname_special_table[s_type][1]):]
        elif s_pos == 'X' and _synoname_special_table[s_type][1] in full_tar2:
            loc = full_tar2.find(' '+_synoname_special_table[s_type][1]+' ')+1
            full_tar2 = (full_tar2[:loc] +
                         full_tar2[loc +
                                   len(_synoname_special_table[s_type][1]):])

    full_src2 = full_src1
    for s_type, s_pos in features['src_specials']:
        if s_pos == 'd':
            full_src2 = full_src2[len(_synoname_special_table[s_type][1]):]
        elif s_pos == 'X' and _synoname_special_table[s_type][1] in full_src2:
            loc = full_src2.find(' '+_synoname_special_table[s_type][1]+' ')+1
            full_src2 = (full_src2[:loc] +
                         full_src2[loc +
                                   len(_synoname_special_table[s_type][1]):])

    full_tar1 = _synoname_strip_punct(full_tar1)
    tar1_words = full_tar1.split()
    tar1_num_words = len(tar1_words)

    full_src1 = _synoname_strip_punct(full_src1)
    src1_words = full_src1.split()
    src1_num_words = len(src1_words)

    full_tar2 = _synoname_strip_punct(full_tar2)
    tar2_words = full_tar2.split()
    tar2_num_words = len(tar2_words)

    full_src2 = _synoname_strip_punct(full_src2)
    src2_words = full_src2.split()
    src2_num_words = len(src2_words)

    # 2
    if (src1_num_words < 2 and src_len_specials == 0 and src2_num_words < 2 and
            tar_len_specials == 0):
        return 0

    # 4
    if (tar1_num_words == 1 and src1_num_words == 1 and
            tar1_words[0] == src1_words[0]):
        return 1
    if tar1_num_words < 2 and tar_len_specials == 0:
        return 0

    # 5
    last_found = False
    for word in tar1_words:
        if src_ln.endswith(word) or word+' ' in src_ln:
            last_found = True

    if not last_found:
        for word in src1_words:
            if tar_ln.endswith(word) or word+' ' in tar_ln:
                last_found = True

    # 6
    matches = 0
    if last_found:
        for i, s_word in enumerate(src1_words):
            for j, t_word in enumerate(tar1_words):
                if s_word == t_word:
                    src1_words[i] = '@'
                    tar1_words[j] = '@'
                    matches += 1
    w_ratio = matches/max(tar1_num_words, src1_num_words)
    if matches > 1 or (matches == 1 and
                       src1_num_words == 1 and tar1_num_words == 1 and
                       (tar_len_specials > 0 or src_len_specials > 0)):
        return w_ratio

    # 8
    if (tar2_num_words == 1 and src2_num_words == 1 and
            tar2_words[0] == src2_words[0]):
        return 1
    if tar2_num_words < 2 and tar_len_specials == 0:
        return 0

    # 9
    last_found = False
    for word in tar2_words:
        if src_ln.endswith(word) or word+' ' in src_ln:
            last_found = True

    if not last_found:
        for word in src2_words:
            if tar_ln.endswith(word) or word+' ' in tar_ln:
                last_found = True

    if not last_found:
        return 0

    # 10
    matches = 0
    if last_found:
        for i, s_word in enumerate(src2_words):
            for j, t_word in enumerate(tar2_words):
                if s_word == t_word:
                    src2_words[i] = '@'
                    tar2_words[j] = '@'
                    matches += 1
    w_ratio = matches/max(tar2_num_words, src2_num_words)
    if matches > 1 or (matches == 1 and
                       src2_num_words == 1 and tar2_num_words == 1 and
                       (tar_len_specials > 0 or src_len_specials > 0)):
        return w_ratio

    return 0


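# Illustrative usage (an addition; the feature values shown are hypothetical
# and would normally come from fingerprint.synoname_toolcode()):
#
#     synoname_word_approximation('brueghel the elder', 'brueghel',
#                                 'pieter', 'pieter',
#                                 features={'gen_conflict': False,
#                                           'roman_conflict': False,
#                                           'src_specials': [],
#                                           'tar_specials': []})

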
def synoname(src, tar, word_approx_min=0.3, char_approx_min=0.73,
             tests=2**11-1):
    """Return the Synoname similarity type of two words.

    Cf. :cite:`Getty:1991,Gross:1991`

    :param str src, tar: two strings to be compared
    :param float word_approx_min: the minimum word approximation value to
        signal a 'word_approx' match
    :param float char_approx_min: the minimum character approximation value
        to signal a 'char_approx' match
    :param int or iterable tests: either an integer bitmask of the tests to
        perform or an iterable of test names to perform
    :return: Synoname value
    :rtype: int
    """
    test_dict = {val: 2**n for n, val in enumerate([
        'exact', 'omission', 'substitution', 'transposition', 'punctuation',
        'initials', 'extended', 'inclusion', 'no_first', 'word_approx',
        'confusions', 'char_approx'])}
    match_type_dict = {val: n for n, val in enumerate([
        'exact', 'omission', 'substitution', 'transposition', 'punctuation',
        'initials', 'extended', 'inclusion', 'no_first', 'word_approx',
        'confusions', 'char_approx', 'no_match'], 1)}

    if isinstance(tests, Iterable):
        new_tests = 0
        for term in tests:
            if term in test_dict:
                new_tests += test_dict[term]
        tests = new_tests

    if isinstance(src, tuple):
        src_ln, src_fn, src_qual = src
    else:
        src_ln, src_fn, src_qual = src.split('#')[1:4]
    if isinstance(tar, tuple):
        tar_ln, tar_fn, tar_qual = tar
    else:
        tar_ln, tar_fn, tar_qual = tar.split('#')[1:4]

    def split_special(spec):
        spec_list = []
        while spec:
            spec_list.append((int(spec[:3]), spec[3:4]))
            spec = spec[4:]
        return spec_list

    # 1. Preprocessing

    # Lowercasing
    src_fn = src_fn.strip().lower()
    src_ln = src_ln.strip().lower()
    src_qual = src_qual.strip().lower()

    tar_fn = tar_fn.strip().lower()
    tar_ln = tar_ln.strip().lower()
    tar_qual = tar_qual.strip().lower()

    # Create toolcodes
    src_fn, src_ln, src_tc = synoname_toolcode(src_fn, src_ln, src_qual)
    tar_fn, tar_ln, tar_tc = synoname_toolcode(tar_fn, tar_ln, tar_qual)

    src_generation = int(src_tc[2])
    src_romancode = int(src_tc[3:6])
    src_len_fn = int(src_tc[6:8])
    src_tc = src_tc.split('#')
    src_specials = split_special(src_tc[1])

    tar_generation = int(tar_tc[2])
    tar_romancode = int(tar_tc[3:6])
    tar_len_fn = int(tar_tc[6:8])
    tar_tc = tar_tc.split('#')
    tar_specials = split_special(tar_tc[1])

    gen_conflict = (src_generation != tar_generation and
                    (src_generation or tar_generation))
    roman_conflict = (src_romancode != tar_romancode and
                      (src_romancode or tar_romancode))

    ln_equal = src_ln == tar_ln
    fn_equal = src_fn == tar_fn

    # approx_c
    def approx_c():
        if gen_conflict or roman_conflict:
            return False, 0

        full_src = ' '.join((src_ln, src_fn))
        if full_src.startswith('master '):
            full_src = full_src[len('master '):]
            for intro in ['of the ', 'of ', 'known as the ', 'with the ',
                          'with ']:
                if full_src.startswith(intro):
                    full_src = full_src[len(intro):]

        full_tar = ' '.join((tar_ln, tar_fn))
        if full_tar.startswith('master '):
            full_tar = full_tar[len('master '):]
            for intro in ['of the ', 'of ', 'known as the ', 'with the ',
                          'with ']:
                if full_tar.startswith(intro):
                    full_tar = full_tar[len(intro):]

        ca_ratio = sim_ratcliff_obershelp(full_src, full_tar)
        return ca_ratio >= char_approx_min, ca_ratio

    ca_ratio = approx_c()[1]

    if tests & test_dict['exact'] and fn_equal and ln_equal:
        return match_type_dict['exact']
    if tests & test_dict['omission']:
        if (fn_equal and
                levenshtein(src_ln, tar_ln, cost=(1, 1, 99, 99)) == 1):
            if not roman_conflict:
                return match_type_dict['omission']
        elif (ln_equal and
              levenshtein(src_fn, tar_fn, cost=(1, 1, 99, 99)) == 1):
            return match_type_dict['omission']
    if tests & test_dict['substitution']:
        if (fn_equal and
                levenshtein(src_ln, tar_ln, cost=(99, 99, 1, 99)) == 1):
            return match_type_dict['substitution']
        if (ln_equal and
                levenshtein(src_fn, tar_fn, cost=(99, 99, 1, 99)) == 1):
            return match_type_dict['substitution']
    if tests & test_dict['transposition']:
        if (fn_equal and
                levenshtein(src_ln, tar_ln, cost=(99, 99, 99, 1)) == 1):
            return match_type_dict['transposition']
        if (ln_equal and
                levenshtein(src_fn, tar_fn, cost=(99, 99, 99, 1)) == 1):
            return match_type_dict['transposition']
    if tests & test_dict['punctuation']:
        np_src_fn = _synoname_strip_punct(src_fn)
        np_tar_fn = _synoname_strip_punct(tar_fn)
        np_src_ln = _synoname_strip_punct(src_ln)
        np_tar_ln = _synoname_strip_punct(tar_ln)

        if np_src_fn == np_tar_fn and np_src_ln == np_tar_ln:
            return match_type_dict['punctuation']
    if tests & test_dict['initials'] and ln_equal:
        if src_fn or tar_fn:
            src_initials = ''.join(_[0] for _ in src_fn.split())
            tar_initials = ''.join(_[0] for _ in tar_fn.split())
            if src_initials == tar_initials:
                return match_type_dict['initials']
            initial_diff = abs(len(src_initials)-len(tar_initials))
            if (initial_diff and
                    ((initial_diff == levenshtein(src_initials, tar_initials,
                                                  cost=(1, 99, 99, 99))) or
                     (initial_diff == levenshtein(tar_initials, src_initials,
                                                  cost=(1, 99, 99, 99))))):
                return match_type_dict['initials']
    if tests & test_dict['extended']:
        if src_ln[0] == tar_ln[0] and (src_ln.startswith(tar_ln) or
                                       tar_ln.startswith(src_ln)):
            if ((not src_len_fn and not tar_len_fn) or
                    src_ln.startswith(tar_ln) or
                    tar_ln.startswith(src_ln)) and not roman_conflict:
                return match_type_dict['extended']
    if tests & test_dict['inclusion'] and ln_equal:
        if src_fn in tar_fn or tar_fn in src_ln:
            return match_type_dict['inclusion']
    if tests & test_dict['no_first'] and ln_equal:
        if src_fn == '' or tar_fn == '':
            return match_type_dict['no_first']
    if tests & test_dict['word_approx']:
        ratio = synoname_word_approximation(src_ln, tar_ln, src_fn, tar_fn,
                                            {'gen_conflict': gen_conflict,
                                             'roman_conflict': roman_conflict,
                                             'src_specials': src_specials,
                                             'tar_specials': tar_specials})
        if ratio == 1 and tests & test_dict['confusions']:
            if ' '.join((src_fn, src_ln)) == ' '.join((tar_fn, tar_ln)):
                return match_type_dict['confusions']
        if ratio >= word_approx_min:
            return match_type_dict['word_approx']
    if tests & test_dict['char_approx']:
        if ca_ratio >= char_approx_min:
            return match_type_dict['char_approx']
    return match_type_dict['no_match']


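# Illustrative usage (an addition, not part of the original module): inputs
# may be (last, first, qualifier) tuples; the returned int maps onto the
# match types enumerated in match_type_dict (1 = 'exact', ..., 13 =
# 'no_match'):
#
#     synoname(('Brueghel', 'Pieter', ''), ('Breughel', 'Pieter', ''))

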
###############################################################################


def sim(src, tar, method=sim_levenshtein):
    """Return a similarity of two strings.

    This is a generalized function for calling other similarity functions.

    :param str src, tar: two strings to be compared
    :param function method: specifies the similarity metric (Levenshtein by
        default)
    :returns: similarity according to the specified function
    :rtype: float

    >>> round(sim('cat', 'hat'), 12)
    0.666666666667
    >>> round(sim('Niall', 'Neil'), 12)
    0.4
    >>> sim('aluminum', 'Catalan')
    0.125
    >>> sim('ATCG', 'TAGC')
    0.25
    """
    if callable(method):
        return method(src, tar)
    raise AttributeError('Unknown similarity function: ' + str(method))


def dist(src, tar, method=sim_levenshtein):
    """Return a distance between two strings.

    This is a generalized function for calling other distance functions.

    :param str src, tar: two strings to be compared
    :param function method: specifies the similarity metric (Levenshtein by
        default) -- Note that this takes a similarity metric function, not
        a distance metric function.
    :returns: distance according to the specified function
    :rtype: float

    >>> round(dist('cat', 'hat'), 12)
    0.333333333333
    >>> round(dist('Niall', 'Neil'), 12)
    0.6
    >>> dist('aluminum', 'Catalan')
    0.875
    >>> dist('ATCG', 'TAGC')
    0.75
    """
    if callable(method):
        return 1 - method(src, tar)
    raise AttributeError('Unknown distance function: ' + str(method))


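# Illustrative addition: any sim_ function in this module can be passed as
# the method, e.g. Editex similarity (cf. the sim_editex doctests above):
#
#     round(sim('cat', 'hat', method=sim_editex), 12)    # -> 0.666666666667

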
if __name__ == '__main__':
    import doctest
    doctest.testmod()