# -*- coding: utf-8 -*-

# Copyright 2014-2018 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.

"""abydos.distance.

The distance module implements string edit distance functions including:

    - Levenshtein distance
    - Optimal String Alignment distance
    - Levenshtein-Damerau distance
    - Hamming distance
    - Tversky index
    - Sørensen–Dice coefficient & distance
    - Jaccard similarity coefficient & distance
    - overlap similarity & distance
    - Tanimoto coefficient & distance
    - Minkowski distance & similarity
    - Manhattan distance & similarity
    - Euclidean distance & similarity
    - Chebyshev distance & similarity
    - cosine similarity & distance
    - Jaro distance
    - Jaro-Winkler distance (incl. the strcmp95 algorithm variant)
    - Longest common substring
    - Ratcliff-Obershelp similarity & distance
    - Match Rating Algorithm similarity
    - Normalized Compression Distance (NCD) & similarity
    - Monge-Elkan similarity & distance
    - Matrix similarity
    - Needleman-Wunsch score
    - Smith-Waterman score
    - Gotoh score
    - Length similarity
    - Prefix, Suffix, and Identity similarity & distance
    - Modified Language-Independent Product Name Search (MLIPNS) similarity &
      distance
    - Bag distance
    - Editex distance
    - Eudex distances
    - Sift4 distance
    - Baystat distance & similarity
    - Typo distance
    - Indel distance

Functions beginning with the prefixes 'sim' and 'dist' are guaranteed to be
in the range [0, 1], and sim_X = 1 - dist_X since the two are complements.
If a sim_X function is supplied identical src & tar arguments, it is guaranteed
to return 1; the corresponding dist_X function is guaranteed to return 0.
"""

from __future__ import division, unicode_literals

import codecs
import math
import sys
import types
import unicodedata
from collections import Counter, Iterable, defaultdict

import numpy as np

from six import text_type
from six.moves import range

from .compression import ac_encode, ac_train, rle_encode
from .fingerprint import synoname_toolcode
from .phonetic import eudex, mra
from .qgram import QGrams

try:
    import lzma
except ImportError:  # pragma: no cover
    # If the system lacks the lzma library, that's fine, but lzma compression
    # similarity won't be supported.
    pass
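

# A minimal sanity sketch of the sim_/dist_ complement guarantee described in
# the module docstring (function names as defined below in this module):
#
#     >>> sim_levenshtein('cat', 'hat') == 1 - dist_levenshtein('cat', 'hat')
#     True
#     >>> dist_levenshtein('abc', 'abc')
#     0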


def levenshtein(src, tar, mode='lev', cost=(1, 1, 1, 1)):
    """Return the Levenshtein distance between two strings.

    Levenshtein distance

    This is the standard edit distance measure. Cf.
    https://en.wikipedia.org/wiki/Levenshtein_distance

    Two additional variants: optimal string alignment (aka restricted
    Damerau-Levenshtein distance) and the Damerau-Levenshtein distance
    are also supported. Cf.
    https://en.wikipedia.org/wiki/Damerau%E2%80%93Levenshtein_distance

    The ordinary Levenshtein & Optimal String Alignment distance both
    employ the Wagner-Fischer dynamic programming algorithm. Cf.
    https://en.wikipedia.org/wiki/Wagner%E2%80%93Fischer_algorithm

    Levenshtein edit distance ordinarily has unit insertion, deletion, and
    substitution costs.

    :param str src, tar: two strings to be compared
    :param str mode: specifies a mode for computing the Levenshtein distance:

        - 'lev' (default) computes the ordinary Levenshtein distance,
          in which edits may include inserts, deletes, and substitutions
        - 'osa' computes the Optimal String Alignment distance, in which
          edits may include inserts, deletes, substitutions, and
          transpositions but substrings may only be edited once
        - 'dam' computes the Damerau-Levenshtein distance, in which
          edits may include inserts, deletes, substitutions, and
          transpositions and substrings may undergo repeated edits

    :param tuple cost: a 4-tuple representing the cost of the four possible
        edits: inserts, deletes, substitutions, and transpositions,
        respectively (by default: (1, 1, 1, 1))
    :returns: the Levenshtein distance between src & tar
    :rtype: int (may return a float if cost has float values)

    >>> levenshtein('cat', 'hat')
    1
    >>> levenshtein('Niall', 'Neil')
    3
    >>> levenshtein('aluminum', 'Catalan')
    7
    >>> levenshtein('ATCG', 'TAGC')
    3

    >>> levenshtein('ATCG', 'TAGC', mode='osa')
    2
    >>> levenshtein('ACTG', 'TAGC', mode='osa')
    4

    >>> levenshtein('ATCG', 'TAGC', mode='dam')
    2
    >>> levenshtein('ACTG', 'TAGC', mode='dam')
    3
    """
    ins_cost, del_cost, sub_cost, trans_cost = cost

    if src == tar:
        return 0
    if not src:
        return len(tar) * ins_cost
    if not tar:
        return len(src) * del_cost

    if 'dam' in mode:
        return damerau_levenshtein(src, tar, cost)

    # pylint: disable=no-member
    d_mat = np.zeros((len(src)+1, len(tar)+1), dtype=np.int)
    # pylint: enable=no-member
    for i in range(len(src)+1):
        d_mat[i, 0] = i * del_cost
    for j in range(len(tar)+1):
        d_mat[0, j] = j * ins_cost

    for i in range(len(src)):
        for j in range(len(tar)):
            d_mat[i+1, j+1] = min(
                d_mat[i+1, j] + ins_cost,  # ins
                d_mat[i, j+1] + del_cost,  # del
                d_mat[i, j] + (sub_cost if src[i] != tar[j] else 0)  # sub/==
            )

            if mode == 'osa':
                if ((i+1 > 1 and j+1 > 1 and src[i] == tar[j-1] and
                     src[i-1] == tar[j])):
                    # transposition
                    d_mat[i+1, j+1] = min(d_mat[i+1, j+1],
                                          d_mat[i-1, j-1] + trans_cost)

    return d_mat[len(src), len(tar)]
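

# A sketch of non-unit costs using the 4-tuple documented above, given as
# (ins, del, sub, trans); doubling the substitution cost makes the
# 'cat' -> 'hat' edit cost 2 (one substitution at cost 2, which ties with
# one delete plus one insert at cost 1 each):
#
#     >>> levenshtein('cat', 'hat', cost=(1, 1, 2, 1))
#     2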


def dist_levenshtein(src, tar, mode='lev', cost=(1, 1, 1, 1)):
    """Return the normalized Levenshtein distance between two strings.

    Levenshtein distance normalized to the interval [0, 1]

    The Levenshtein distance is normalized by dividing the Levenshtein distance
    (calculated by any of the three supported methods) by the greater of
    the number of characters in src times the cost of a delete and
    the number of characters in tar times the cost of an insert.
    For the case in which all operations have :math:`cost = 1`, this is
    equivalent to the greater of the length of the two strings src & tar.

    :param str src, tar: two strings to be compared
    :param str mode: specifies a mode for computing the Levenshtein distance:

        - 'lev' (default) computes the ordinary Levenshtein distance,
          in which edits may include inserts, deletes, and substitutions
        - 'osa' computes the Optimal String Alignment distance, in which
          edits may include inserts, deletes, substitutions, and
          transpositions but substrings may only be edited once
        - 'dam' computes the Damerau-Levenshtein distance, in which
          edits may include inserts, deletes, substitutions, and
          transpositions and substrings may undergo repeated edits

    :param tuple cost: a 4-tuple representing the cost of the four possible
        edits: inserts, deletes, substitutions, and transpositions,
        respectively (by default: (1, 1, 1, 1))
    :returns: normalized Levenshtein distance
    :rtype: float

    >>> dist_levenshtein('cat', 'hat')
    0.33333333333333331
    >>> dist_levenshtein('Niall', 'Neil')
    0.59999999999999998
    >>> dist_levenshtein('aluminum', 'Catalan')
    0.875
    >>> dist_levenshtein('ATCG', 'TAGC')
    0.75
    """
    if src == tar:
        return 0
    ins_cost, del_cost = cost[:2]
    return (levenshtein(src, tar, mode, cost) /
            (max(len(src)*del_cost, len(tar)*ins_cost)))


def sim_levenshtein(src, tar, mode='lev', cost=(1, 1, 1, 1)):
    """Return the Levenshtein similarity of two strings.

    Levenshtein similarity normalized to the interval [0, 1]

    Levenshtein similarity is the complement of Levenshtein distance:
    :math:`sim_{Levenshtein} = 1 - dist_{Levenshtein}`

    The arguments are identical to those of the levenshtein() function.

    :param str src, tar: two strings to be compared
    :param str mode: specifies a mode for computing the Levenshtein distance:

            - 'lev' (default) computes the ordinary Levenshtein distance,
              in which edits may include inserts, deletes, and substitutions
            - 'osa' computes the Optimal String Alignment distance, in which
              edits may include inserts, deletes, substitutions, and
              transpositions but substrings may only be edited once
            - 'dam' computes the Damerau-Levenshtein distance, in which
              edits may include inserts, deletes, substitutions, and
              transpositions and substrings may undergo repeated edits

    :param tuple cost: a 4-tuple representing the cost of the four possible
        edits:
        inserts, deletes, substitutions, and transpositions, respectively
        (by default: (1, 1, 1, 1))
    :returns: normalized Levenshtein similarity
    :rtype: float

    >>> sim_levenshtein('cat', 'hat')
    0.66666666666666674
    >>> sim_levenshtein('Niall', 'Neil')
    0.40000000000000002
    >>> sim_levenshtein('aluminum', 'Catalan')
    0.125
    >>> sim_levenshtein('ATCG', 'TAGC')
    0.25
    """
    return 1 - dist_levenshtein(src, tar, mode, cost)


def damerau_levenshtein(src, tar, cost=(1, 1, 1, 1)):
    """Return the Damerau-Levenshtein distance between two strings.

    Damerau-Levenshtein distance

    This computes the Damerau-Levenshtein distance. Cf.
    https://en.wikipedia.org/wiki/Damerau%E2%80%93Levenshtein_distance

    Damerau-Levenshtein code based on Java code by Kevin L. Stern,
    under the MIT license:
    https://github.com/KevinStern/software-and-algorithms/blob/master/src/main/java/blogspot/software_and_algorithms/stern_library/string/DamerauLevenshteinAlgorithm.java

    :param str src, tar: two strings to be compared
    :param tuple cost: a 4-tuple representing the cost of the four possible
        edits:
        inserts, deletes, substitutions, and transpositions, respectively
        (by default: (1, 1, 1, 1))
    :returns: the Damerau-Levenshtein distance between src & tar
    :rtype: int (may return a float if cost has float values)

    >>> damerau_levenshtein('cat', 'hat')
    1
    >>> damerau_levenshtein('Niall', 'Neil')
    3
    >>> damerau_levenshtein('aluminum', 'Catalan')
    7
    >>> damerau_levenshtein('ATCG', 'TAGC')
    2
    """
    ins_cost, del_cost, sub_cost, trans_cost = cost

    if src == tar:
        return 0
    if not src:
        return len(tar) * ins_cost
    if not tar:
        return len(src) * del_cost

    if 2*trans_cost < ins_cost + del_cost:
        raise ValueError('Unsupported cost assignment; the cost of two ' +
                         'transpositions must not be less than the cost of ' +
                         'an insert plus a delete.')

    # pylint: disable=no-member
    d_mat = (np.zeros((len(src))*(len(tar)), dtype=np.int).
             reshape((len(src), len(tar))))
    # pylint: enable=no-member

    if src[0] != tar[0]:
        d_mat[0, 0] = min(sub_cost, ins_cost + del_cost)

    src_index_by_character = {}
    src_index_by_character[src[0]] = 0
    for i in range(1, len(src)):
        del_distance = d_mat[i-1, 0] + del_cost
        ins_distance = (i+1) * del_cost + ins_cost
        match_distance = (i * del_cost +
                          (0 if src[i] == tar[0] else sub_cost))
        d_mat[i, 0] = min(del_distance, ins_distance, match_distance)

    for j in range(1, len(tar)):
        del_distance = (j+1) * ins_cost + del_cost
        ins_distance = d_mat[0, j-1] + ins_cost
        match_distance = (j * ins_cost +
                          (0 if src[0] == tar[j] else sub_cost))
        d_mat[0, j] = min(del_distance, ins_distance, match_distance)

    for i in range(1, len(src)):
        max_src_letter_match_index = (0 if src[i] == tar[0] else -1)
        for j in range(1, len(tar)):
            candidate_swap_index = (-1 if tar[j] not in
                                    src_index_by_character else
                                    src_index_by_character[tar[j]])
            j_swap = max_src_letter_match_index
            del_distance = d_mat[i-1, j] + del_cost
            ins_distance = d_mat[i, j-1] + ins_cost
            match_distance = d_mat[i-1, j-1]
            if src[i] != tar[j]:
                match_distance += sub_cost
            else:
                max_src_letter_match_index = j

            if candidate_swap_index != -1 and j_swap != -1:
                i_swap = candidate_swap_index

                if i_swap == 0 and j_swap == 0:
                    pre_swap_cost = 0
                else:
                    pre_swap_cost = d_mat[max(0, i_swap-1), max(0, j_swap-1)]
                swap_distance = (pre_swap_cost + (i - i_swap - 1) *
                                 del_cost + (j - j_swap - 1) * ins_cost +
                                 trans_cost)
            else:
                swap_distance = sys.maxsize

            d_mat[i, j] = min(del_distance, ins_distance,
                              match_distance, swap_distance)
        src_index_by_character[src[i]] = i

    return d_mat[len(src)-1, len(tar)-1]
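

# A note on the cost constraint above: assignments in which two
# transpositions are cheaper than an insert plus a delete are rejected, so
# e.g. damerau_levenshtein('cat', 'hat', cost=(4, 4, 1, 1)) raises
# ValueError, since 2 * 1 < 4 + 4.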


def dist_damerau(src, tar, cost=(1, 1, 1, 1)):
    """Return the normalized Damerau-Levenshtein distance between two strings.

    Damerau-Levenshtein distance normalized to the interval [0, 1]

    The Damerau-Levenshtein distance is normalized by dividing the
    Damerau-Levenshtein distance by the greater of
    the number of characters in src times the cost of a delete and
    the number of characters in tar times the cost of an insert.
    For the case in which all operations have :math:`cost = 1`, this is
    equivalent to the greater of the length of the two strings src & tar.

    The arguments are identical to those of the levenshtein() function.

    :param str src, tar: two strings to be compared
    :param tuple cost: a 4-tuple representing the cost of the four possible
        edits:
        inserts, deletes, substitutions, and transpositions, respectively
        (by default: (1, 1, 1, 1))
    :returns: normalized Damerau-Levenshtein distance
    :rtype: float

    >>> dist_damerau('cat', 'hat')
    0.33333333333333331
    >>> dist_damerau('Niall', 'Neil')
    0.59999999999999998
    >>> dist_damerau('aluminum', 'Catalan')
    0.875
    >>> dist_damerau('ATCG', 'TAGC')
    0.5
    """
    if src == tar:
        return 0
    ins_cost, del_cost = cost[:2]
    return (damerau_levenshtein(src, tar, cost) /
            (max(len(src)*del_cost, len(tar)*ins_cost)))


def sim_damerau(src, tar, cost=(1, 1, 1, 1)):
    """Return the Damerau-Levenshtein similarity of two strings.

    Damerau-Levenshtein similarity normalized to the interval [0, 1]

    Damerau-Levenshtein similarity is the complement of Damerau-Levenshtein
    distance:
    :math:`sim_{Damerau} = 1 - dist_{Damerau}`

    The arguments are identical to those of the levenshtein() function.

    :param str src, tar: two strings to be compared
    :param tuple cost: a 4-tuple representing the cost of the four possible
        edits:
        inserts, deletes, substitutions, and transpositions, respectively
        (by default: (1, 1, 1, 1))
    :returns: normalized Damerau-Levenshtein similarity
    :rtype: float

    >>> sim_damerau('cat', 'hat')
    0.66666666666666674
    >>> sim_damerau('Niall', 'Neil')
    0.40000000000000002
    >>> sim_damerau('aluminum', 'Catalan')
    0.125
    >>> sim_damerau('ATCG', 'TAGC')
    0.5
    """
    return 1 - dist_damerau(src, tar, cost)


def hamming(src, tar, difflens=True):
    """Return the Hamming distance between two strings.

    Hamming distance

    Hamming distance equals the number of character positions at which two
    strings differ. For strings of unequal lengths, it is not normally defined.
    By default, this implementation calculates the Hamming distance of the
    first n characters where n is the lesser of the two strings' lengths and
    adds to this the difference in string lengths.

    :param str src, tar: two strings to be compared
    :param bool difflens:
        If True (default), this returns the Hamming distance for those
        characters that have a matching character in both strings plus the
        difference in the strings' lengths. This is equivalent to extending
        the shorter string with obligatorily non-matching characters.
        If False, an exception is raised in the case of strings of unequal
        lengths.
    :returns: the Hamming distance between src & tar
    :rtype: int

    >>> hamming('cat', 'hat')
    1
    >>> hamming('Niall', 'Neil')
    3
    >>> hamming('aluminum', 'Catalan')
    8
    >>> hamming('ATCG', 'TAGC')
    4
    """
    if not difflens and len(src) != len(tar):
        raise ValueError('Undefined for sequences of unequal length; set ' +
                         'difflens to True for Hamming distance between ' +
                         'strings of unequal lengths.')

    hdist = 0
    if difflens:
        hdist += abs(len(src)-len(tar))
    hdist += sum(c1 != c2 for c1, c2 in zip(src, tar))

    return hdist
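

# A sketch of the difflens flag: length differences count as mismatches when
# difflens is True (the default), while unequal lengths raise ValueError when
# it is False:
#
#     >>> hamming('cat', 'ca')
#     1
#
# whereas hamming('cat', 'ca', difflens=False) raises ValueError.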


def dist_hamming(src, tar, difflens=True):
    """Return the normalized Hamming distance between two strings.

    Hamming distance normalized to the interval [0, 1]

    The Hamming distance is normalized by dividing it
    by the greater of the number of characters in src & tar (unless difflens is
    set to False, in which case an exception is raised).

    The arguments are identical to those of the hamming() function.

    :param str src, tar: two strings to be compared
    :param bool difflens:
        If True (default), this returns the Hamming distance for those
        characters that have a matching character in both strings plus the
        difference in the strings' lengths. This is equivalent to extending
        the shorter string with obligatorily non-matching characters.
        If False, an exception is raised in the case of strings of unequal
        lengths.
    :returns: normalized Hamming distance
    :rtype: float

    >>> dist_hamming('cat', 'hat')
    0.3333333333333333
    >>> dist_hamming('Niall', 'Neil')
    0.6
    >>> dist_hamming('aluminum', 'Catalan')
    1.0
    >>> dist_hamming('ATCG', 'TAGC')
    1.0
    """
    if src == tar:
        return 0
    return hamming(src, tar, difflens) / max(len(src), len(tar))


def sim_hamming(src, tar, difflens=True):
    """Return the normalized Hamming similarity of two strings.

    Hamming similarity normalized to the interval [0, 1]

    Hamming similarity is the complement of normalized Hamming distance:
    :math:`sim_{Hamming} = 1 - dist_{Hamming}`

    Provided that difflens==True, the Hamming similarity is identical to the
    Language-Independent Product Name Search (LIPNS) similarity score. For
    further information, see the sim_mlipns documentation.

    The arguments are identical to those of the hamming() function.

    :param str src, tar: two strings to be compared
    :param bool difflens:
        If True (default), this returns the Hamming distance for those
        characters that have a matching character in both strings plus the
        difference in the strings' lengths. This is equivalent to extending
        the shorter string with obligatorily non-matching characters.
        If False, an exception is raised in the case of strings of unequal
        lengths.
    :returns: normalized Hamming similarity
    :rtype: float

    >>> sim_hamming('cat', 'hat')
    0.6666666666666667
    >>> sim_hamming('Niall', 'Neil')
    0.4
    >>> sim_hamming('aluminum', 'Catalan')
    0.0
    >>> sim_hamming('ATCG', 'TAGC')
    0.0
    """
    return 1 - dist_hamming(src, tar, difflens)


def _get_qgrams(src, tar, qval=None, skip=0):
    """Return the Q-Grams in src & tar.

    :param str src, tar: two strings to be compared
        (or QGrams/Counter objects)
    :param int qval: the length of each q-gram; 0 or None for non-q-gram
        version
    :param int skip: the number of characters to skip (only works when
        src and tar are strings)
    :return: Q-Grams
    """
    if isinstance(src, Counter) and isinstance(tar, Counter):
        return src, tar
    if qval and qval > 0:
        return (QGrams(src, qval, '$#', skip),
                QGrams(tar, qval, '$#', skip))
    return Counter(src.strip().split()), Counter(tar.strip().split())
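

# A sketch of what the q-gram helpers compare, assuming QGrams pads each
# string with the '$' start and '#' stop symbols passed above: with qval=2,
# 'cat' is treated as the bag of 2-grams {$c, ca, at, t#}. With qval=0 or
# None, whitespace-separated tokens are counted instead.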


def sim_tversky(src, tar, qval=2, alpha=1, beta=1, bias=None):
    r"""Return the Tversky index of two strings.

    Tversky index

    The Tversky index is defined as:
    For two sets X and Y:
    :math:`sim_{Tversky}(X, Y) = \\frac{|X \\cap Y|}
    {|X \\cap Y| + \\alpha|X - Y| + \\beta|Y - X|}`

    Cf. https://en.wikipedia.org/wiki/Tversky_index

    :math:`\\alpha = \\beta = 1` is equivalent to the Jaccard & Tanimoto
    similarity coefficients.

    :math:`\\alpha = \\beta = 0.5` is equivalent to the Sørensen-Dice
    similarity coefficient.

    Unequal α and β will tend to emphasize one or the other set's
    contributions:

        - :math:`\\alpha > \\beta` emphasizes the contributions of X over Y
        - :math:`\\alpha < \\beta` emphasizes the contributions of Y over X

    Parameter values' relation to 1 emphasizes different types of
    contributions:

        - :math:`\\alpha, \\beta > 1` emphasize unique contributions over the
          intersection
        - :math:`\\alpha, \\beta < 1` emphasize the intersection over unique
          contributions

    The symmetric variant is defined in Jimenez, Sergio, Claudio Becerra, and
    Alexander Gelbukh. 2013. SOFTCARDINALITY-CORE: Improving Text Overlap with
    Distributional Measures for Semantic Textual Similarity. This is activated
    by specifying a bias parameter.
    Cf. http://aclweb.org/anthology/S/S13/S13-1028.pdf

    :param str src, tar: two strings to be compared
        (or QGrams/Counter objects)
    :param int qval: the length of each q-gram; 0 or None for non-q-gram
        version
    :param float alpha, beta: two Tversky index parameters as indicated in the
        description above
    :param float bias: the symmetric Tversky index bias parameter; specify a
        value to activate the symmetric variant (by default: None)
    :returns: Tversky similarity
    :rtype: float

    >>> sim_tversky('cat', 'hat')
    0.3333333333333333
    >>> sim_tversky('Niall', 'Neil')
    0.2222222222222222
    >>> sim_tversky('aluminum', 'Catalan')
    0.0625
    >>> sim_tversky('ATCG', 'TAGC')
    0.0
    """
    if alpha < 0 or beta < 0:
        raise ValueError('Unsupported weight assignment; alpha and beta ' +
                         'must be greater than or equal to 0.')

    if src == tar:
        return 1.0
    if not src or not tar:
        return 0.0

    q_src, q_tar = _get_qgrams(src, tar, qval)
    q_src_mag = sum(q_src.values())
    q_tar_mag = sum(q_tar.values())
    q_intersection_mag = sum((q_src & q_tar).values())

    if not q_src or not q_tar:
        return 0.0

    if bias is None:
        return q_intersection_mag / (q_intersection_mag + alpha *
                                     (q_src_mag - q_intersection_mag) +
                                     beta * (q_tar_mag - q_intersection_mag))

    a_val = min(q_src_mag - q_intersection_mag,
                q_tar_mag - q_intersection_mag)
    b_val = max(q_src_mag - q_intersection_mag,
                q_tar_mag - q_intersection_mag)
    c_val = q_intersection_mag + bias
    return c_val / (beta * (alpha * a_val + (1 - alpha) * b_val) + c_val)
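

# A sketch of the parameter equivalences noted above: with alpha = beta = 0.5
# the Tversky index reduces to the Sørensen–Dice coefficient, and with
# alpha = beta = 1 it reduces to the Jaccard coefficient (sim_dice and
# sim_jaccard below are defined exactly this way):
#
#     >>> sim_tversky('cat', 'hat', alpha=1, beta=1) == sim_jaccard('cat', 'hat')
#     True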


def dist_tversky(src, tar, qval=2, alpha=1, beta=1, bias=None):
    """Return the Tversky distance between two strings.

    Tversky distance

    Tversky distance is the complement of the Tversky index (similarity):
    :math:`dist_{Tversky} = 1-sim_{Tversky}`

    The symmetric variant is defined in Jimenez, Sergio, Claudio Becerra, and
    Alexander Gelbukh. 2013. SOFTCARDINALITY-CORE: Improving Text Overlap with
    Distributional Measures for Semantic Textual Similarity. This is activated
    by specifying a bias parameter.
    Cf. http://aclweb.org/anthology/S/S13/S13-1028.pdf

    :param str src, tar: two strings to be compared
        (or QGrams/Counter objects)
    :param int qval: the length of each q-gram; 0 or None for non-q-gram
        version
    :param float alpha, beta: two Tversky index parameters as indicated in the
        description of sim_tversky
    :param float bias: the symmetric Tversky index bias parameter; specify a
        value to activate the symmetric variant (by default: None)
    :returns: Tversky distance
    :rtype: float

    >>> dist_tversky('cat', 'hat')
    0.6666666666666667
    >>> dist_tversky('Niall', 'Neil')
    0.7777777777777778
    >>> dist_tversky('aluminum', 'Catalan')
    0.9375
    >>> dist_tversky('ATCG', 'TAGC')
    1.0
    """
    return 1 - sim_tversky(src, tar, qval, alpha, beta, bias)


def sim_dice(src, tar, qval=2):
    r"""Return the Sørensen–Dice coefficient of two strings.

    Sørensen–Dice coefficient

    For two sets X and Y, the Sørensen–Dice coefficient is
    :math:`sim_{dice}(X, Y) = \\frac{2 \\cdot |X \\cap Y|}{|X| + |Y|}`

    This is equivalent to the Tversky index for
    :math:`\\alpha = \\beta = 0.5`

    :param str src, tar: two strings to be compared (or QGrams/Counter objects)
    :param int qval: the length of each q-gram; 0 or None for non-q-gram
        version
    :returns: Sørensen–Dice similarity
    :rtype: float

    >>> sim_dice('cat', 'hat')
    0.5
    >>> sim_dice('Niall', 'Neil')
    0.36363636363636365
    >>> sim_dice('aluminum', 'Catalan')
    0.11764705882352941
    >>> sim_dice('ATCG', 'TAGC')
    0.0
    """
    return sim_tversky(src, tar, qval, 0.5, 0.5)


def dist_dice(src, tar, qval=2):
    """Return the Sørensen–Dice distance between two strings.

    Sørensen–Dice distance

    Sørensen–Dice distance is the complement of the Sørensen–Dice coefficient:
    :math:`dist_{dice} = 1 - sim_{dice}`

    :param str src, tar: two strings to be compared (or QGrams/Counter objects)
    :param int qval: the length of each q-gram; 0 or None for non-q-gram
        version
    :returns: Sørensen–Dice distance
    :rtype: float

    >>> dist_dice('cat', 'hat')
    0.5
    >>> dist_dice('Niall', 'Neil')
    0.6363636363636364
    >>> dist_dice('aluminum', 'Catalan')
    0.8823529411764706
    >>> dist_dice('ATCG', 'TAGC')
    1.0
    """
    return 1 - sim_dice(src, tar, qval)


def sim_jaccard(src, tar, qval=2):
    r"""Return the Jaccard similarity of two strings.

    Jaccard similarity coefficient

    For two sets X and Y, the Jaccard similarity coefficient is
    :math:`sim_{jaccard}(X, Y) = \\frac{|X \\cap Y|}{|X \\cup Y|}`

    This is identical to the Tanimoto similarity coefficient
    and the Tversky index for :math:`\\alpha = \\beta = 1`

    :param str src, tar: two strings to be compared (or QGrams/Counter objects)
    :param int qval: the length of each q-gram; 0 or None for non-q-gram
        version
    :returns: Jaccard similarity
    :rtype: float

    >>> sim_jaccard('cat', 'hat')
    0.3333333333333333
    >>> sim_jaccard('Niall', 'Neil')
    0.2222222222222222
    >>> sim_jaccard('aluminum', 'Catalan')
    0.0625
    >>> sim_jaccard('ATCG', 'TAGC')
    0.0
    """
    return sim_tversky(src, tar, qval, 1, 1)


def dist_jaccard(src, tar, qval=2):
    """Return the Jaccard distance between two strings.

    Jaccard distance

    Jaccard distance is the complement of the Jaccard similarity coefficient:
    :math:`dist_{Jaccard} = 1 - sim_{Jaccard}`

    :param str src, tar: two strings to be compared (or QGrams/Counter objects)
    :param int qval: the length of each q-gram; 0 or None for non-q-gram
        version
    :returns: Jaccard distance
    :rtype: float

    >>> dist_jaccard('cat', 'hat')
    0.6666666666666667
    >>> dist_jaccard('Niall', 'Neil')
    0.7777777777777778
    >>> dist_jaccard('aluminum', 'Catalan')
    0.9375
    >>> dist_jaccard('ATCG', 'TAGC')
    1.0
    """
    return 1 - sim_jaccard(src, tar, qval)


def sim_overlap(src, tar, qval=2):
    r"""Return the overlap coefficient of two strings.

    Overlap coefficient

    For two sets X and Y, the overlap coefficient is
    :math:`sim_{overlap}(X, Y) = \\frac{|X \\cap Y|}{min(|X|, |Y|)}`

    :param str src, tar: two strings to be compared (or QGrams/Counter objects)
    :param int qval: the length of each q-gram; 0 or None for non-q-gram
        version
    :returns: overlap similarity
    :rtype: float

    >>> sim_overlap('cat', 'hat')
    0.5
    >>> sim_overlap('Niall', 'Neil')
    0.4
    >>> sim_overlap('aluminum', 'Catalan')
    0.125
    >>> sim_overlap('ATCG', 'TAGC')
    0.0
    """
    if src == tar:
        return 1.0
    if not src or not tar:
        return 0.0

    q_src, q_tar = _get_qgrams(src, tar, qval)
    q_src_mag = sum(q_src.values())
    q_tar_mag = sum(q_tar.values())
    q_intersection_mag = sum((q_src & q_tar).values())

    return q_intersection_mag / min(q_src_mag, q_tar_mag)


def dist_overlap(src, tar, qval=2):
    """Return the overlap distance between two strings.

    Overlap distance

    Overlap distance is the complement of the overlap coefficient:
    :math:`dist_{overlap} = 1 - sim_{overlap}`

    :param str src, tar: two strings to be compared (or QGrams/Counter objects)
    :param int qval: the length of each q-gram; 0 or None for non-q-gram
        version
    :returns: overlap distance
    :rtype: float

    >>> dist_overlap('cat', 'hat')
    0.5
    >>> dist_overlap('Niall', 'Neil')
    0.6
    >>> dist_overlap('aluminum', 'Catalan')
    0.875
    >>> dist_overlap('ATCG', 'TAGC')
    1.0
    """
    return 1 - sim_overlap(src, tar, qval)


def sim_tanimoto(src, tar, qval=2):
    r"""Return the Tanimoto similarity of two strings.

    Tanimoto similarity

    For two sets X and Y, the Tanimoto similarity coefficient is
    :math:`sim_{Tanimoto}(X, Y) = \\frac{|X \\cap Y|}{|X \\cup Y|}`

    This is identical to the Jaccard similarity coefficient
    and the Tversky index for :math:`\\alpha = \\beta = 1`

    :param str src, tar: two strings to be compared (or QGrams/Counter objects)
    :param int qval: the length of each q-gram; 0 or None for non-q-gram
        version
    :returns: Tanimoto similarity
    :rtype: float

    >>> sim_tanimoto('cat', 'hat')
    0.3333333333333333
    >>> sim_tanimoto('Niall', 'Neil')
    0.2222222222222222
    >>> sim_tanimoto('aluminum', 'Catalan')
    0.0625
    >>> sim_tanimoto('ATCG', 'TAGC')
    0.0
    """
    return sim_jaccard(src, tar, qval)


def tanimoto(src, tar, qval=2):
    """Return the Tanimoto distance between two strings.

    Tanimoto distance

    Tanimoto distance is :math:`-log_{2}sim_{Tanimoto}`

    :param str src, tar: two strings to be compared (or QGrams/Counter objects)
    :param int qval: the length of each q-gram; 0 or None for non-q-gram
        version
    :returns: Tanimoto distance
    :rtype: float

    >>> tanimoto('cat', 'hat')
    -1.5849625007211563
    >>> tanimoto('Niall', 'Neil')
    -2.1699250014423126
    >>> tanimoto('aluminum', 'Catalan')
    -4.0
    >>> tanimoto('ATCG', 'TAGC')
    -inf
    """
    coeff = sim_jaccard(src, tar, qval)
    if coeff != 0:
        return math.log(coeff, 2)

    return float('-inf')
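

# A quick consequence of the definition above: since the Tanimoto distance is
# the base-2 log of the Jaccard coefficient, identical strings score 0.0 and
# strings sharing no q-grams score -inf:
#
#     >>> tanimoto('cat', 'cat')
#     0.0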


def minkowski(src, tar, qval=2, pval=1, normalize=False):
    """Return the Minkowski distance (:math:`L^p-norm`) of two strings.

    :param str src, tar: two strings to be compared (or QGrams/Counter
        objects)
    :param int qval: the length of each q-gram; 0 or None for non-q-gram
        version
    :param int or float pval: the :math:`p`-value of the :math:`L^p`-space
    :param bool normalize: normalizes to [0, 1] if True
    :returns: the Minkowski distance
    :rtype: float
    """
    q_src, q_tar = _get_qgrams(src, tar, qval)
    diffs = ((q_src - q_tar) + (q_tar - q_src)).values()

    normalizer = 1
    if normalize:
        totals = (q_src + q_tar).values()
        if pval == 0:
            normalizer = len(totals)
        else:
            normalizer = sum(_**pval for _ in totals)**(1 / pval)

    if pval == float('inf'):
        # Chebyshev distance
        return max(diffs)/normalizer
    if pval == 0:
        # This is the l_0 "norm" as developed by David Donoho
        return len(diffs)/normalizer
    return sum(_**pval for _ in diffs)**(1 / pval)/normalizer
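

# A sketch of how pval selects the norm; the wrappers below simply fix it for
# the common cases (pval=1 for Manhattan, pval=2 for Euclidean, and
# pval=float('inf') for Chebyshev):
#
#     >>> manhattan('cat', 'hat') == minkowski('cat', 'hat', pval=1)
#     True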


def dist_minkowski(src, tar, qval=2, pval=1):
    """Return Minkowski distance of two strings, normalized to [0, 1].

    :param str src, tar: two strings to be compared (or QGrams/Counter
        objects)
    :param int qval: the length of each q-gram; 0 or None for non-q-gram
        version
    :param int or float pval: the :math:`p`-value of the :math:`L^p`-space
    :returns: normalized Minkowski distance
    :rtype: float
    """
    return minkowski(src, tar, qval, pval, True)


def sim_minkowski(src, tar, qval=2, pval=1):
    """Return Minkowski similarity of two strings, normalized to [0, 1].

    :param str src, tar: two strings to be compared (or QGrams/Counter
        objects)
    :param int qval: the length of each q-gram; 0 or None for non-q-gram
        version
    :param int or float pval: the :math:`p`-value of the :math:`L^p`-space
    :returns: normalized Minkowski similarity
    :rtype: float
    """
    return 1 - minkowski(src, tar, qval, pval, True)


def manhattan(src, tar, qval=2, normalize=False):
    """Return the Manhattan distance between two strings.

    :param str src, tar: two strings to be compared (or QGrams/Counter
        objects)
    :param int qval: the length of each q-gram; 0 or None for non-q-gram
        version
    :param bool normalize: normalizes to [0, 1] if True
    :returns: the Manhattan distance
    :rtype: float
    """
    return minkowski(src, tar, qval, 1, normalize)


def dist_manhattan(src, tar, qval=2):
    """Return the Manhattan distance between two strings, normalized to [0, 1].

    When normalized over the total q-gram counts in this way, this is
    equivalent to the Bray-Curtis dissimilarity.

    :param str src, tar: two strings to be compared (or QGrams/Counter
        objects)
    :param int qval: the length of each q-gram; 0 or None for non-q-gram
        version
    :returns: normalized Manhattan distance
    :rtype: float
    """
    return manhattan(src, tar, qval, True)


def sim_manhattan(src, tar, qval=2):
    """Return the Manhattan similarity of two strings, normalized to [0, 1].

    :param str src, tar: two strings to be compared (or QGrams/Counter
        objects)
    :param int qval: the length of each q-gram; 0 or None for non-q-gram
        version
    :returns: normalized Manhattan similarity
    :rtype: float
    """
    return 1 - manhattan(src, tar, qval, True)


def euclidean(src, tar, qval=2, normalize=False):
    """Return the Euclidean distance between two strings.

    :param str src, tar: two strings to be compared (or QGrams/Counter
        objects)
    :param int qval: the length of each q-gram; 0 or None for non-q-gram
        version
    :param bool normalize: normalizes to [0, 1] if True
    :returns: the Euclidean distance
    :rtype: float
    """
    return minkowski(src, tar, qval, 2, normalize)


def dist_euclidean(src, tar, qval=2):
    """Return the Euclidean distance between two strings, normalized to [0, 1].

    :param str src, tar: two strings to be compared (or QGrams/Counter
        objects)
    :param int qval: the length of each q-gram; 0 or None for non-q-gram
        version
    :returns: normalized Euclidean distance
    :rtype: float
    """
    return euclidean(src, tar, qval, True)


def sim_euclidean(src, tar, qval=2):
    """Return the Euclidean similarity of two strings, normalized to [0, 1].

    :param str src, tar: two strings to be compared (or QGrams/Counter
        objects)
    :param int qval: the length of each q-gram; 0 or None for non-q-gram
        version
    :returns: normalized Euclidean similarity
    :rtype: float
    """
    return 1 - euclidean(src, tar, qval, True)


def chebyshev(src, tar, qval=2, normalize=False):
    """Return the Chebyshev distance between two strings.

    :param str src, tar: two strings to be compared (or QGrams/Counter
        objects)
    :param int qval: the length of each q-gram; 0 or None for non-q-gram
        version
    :param bool normalize: normalizes to [0, 1] if True
    :returns: the Chebyshev distance
    :rtype: float
    """
    return minkowski(src, tar, qval, float('inf'), normalize)


def dist_chebyshev(src, tar, qval=2):
    """Return the Chebyshev distance between two strings, normalized to [0, 1].

    :param str src, tar: two strings to be compared (or QGrams/Counter
        objects)
    :param int qval: the length of each q-gram; 0 or None for non-q-gram
        version
    :returns: normalized Chebyshev distance
    :rtype: float
    """
    return chebyshev(src, tar, qval, True)


def sim_chebyshev(src, tar, qval=2):
    """Return the Chebyshev similarity of two strings, normalized to [0, 1].

    :param str src, tar: two strings to be compared (or QGrams/Counter
        objects)
    :param int qval: the length of each q-gram; 0 or None for non-q-gram
        version
    :returns: normalized Chebyshev similarity
    :rtype: float
    """
    return 1 - chebyshev(src, tar, qval, True)


def sim_cosine(src, tar, qval=2):
    r"""Return the cosine similarity of two strings.

    Cosine similarity (Ochiai coefficient)

    For two sets X and Y, the cosine similarity (Ochiai coefficient) is:
    :math:`sim_{cosine}(X, Y) = \\frac{|X \\cap Y|}{\\sqrt{|X| \\cdot |Y|}}`

    :param str src, tar: two strings to be compared (or QGrams/Counter objects)
    :param int qval: the length of each q-gram; 0 or None for non-q-gram
        version
    :returns: cosine similarity
    :rtype: float

    >>> sim_cosine('cat', 'hat')
    0.5
    >>> sim_cosine('Niall', 'Neil')
    0.3651483716701107
    >>> sim_cosine('aluminum', 'Catalan')
    0.11785113019775793
    >>> sim_cosine('ATCG', 'TAGC')
    0.0
    """
    if src == tar:
        return 1.0
    if not src or not tar:
        return 0.0

    q_src, q_tar = _get_qgrams(src, tar, qval)
    q_src_mag = sum(q_src.values())
    q_tar_mag = sum(q_tar.values())
    q_intersection_mag = sum((q_src & q_tar).values())

    return q_intersection_mag / math.sqrt(q_src_mag * q_tar_mag)


def dist_cosine(src, tar, qval=2):
    """Return the cosine distance between two strings.

    Cosine distance

    Cosine distance is the complement of cosine similarity:
    :math:`dist_{cosine} = 1 - sim_{cosine}`

    :param str src, tar: two strings to be compared (or QGrams/Counter objects)
    :param int qval: the length of each q-gram; 0 or None for non-q-gram
        version
    :returns: cosine distance
    :rtype: float

    >>> dist_cosine('cat', 'hat')
    0.5
    >>> dist_cosine('Niall', 'Neil')
    0.6348516283298893
    >>> dist_cosine('aluminum', 'Catalan')
    0.882148869802242
    >>> dist_cosine('ATCG', 'TAGC')
    1.0
    """
    return 1 - sim_cosine(src, tar, qval)


def sim_strcmp95(src, tar, long_strings=False):
    """Return the strcmp95 similarity of two strings.

    strcmp95 similarity

    This is a Python translation of the C code for strcmp95:
    http://web.archive.org/web/20110629121242/http://www.census.gov/geo/msb/stand/strcmp.c
    The above file is a US Government publication and, accordingly,
    in the public domain.

    This is based on the Jaro-Winkler distance, but also attempts to correct
    for some common typos and frequently confused characters. It is also
    limited to uppercase ASCII characters, so it is appropriate to American
    names, but not much else.

    :param str src, tar: two strings to be compared
    :param bool long_strings: set to True to "Increase the probability of a
        match when the number of matched characters is large.  This option
        allows for a little more tolerance when the strings are large. It is
        not an appropriate test when comparing fixed length fields such as
        phone and social security numbers."
    :returns: strcmp95 similarity
    :rtype: float

    >>> sim_strcmp95('cat', 'hat')
    0.7777777777777777
    >>> sim_strcmp95('Niall', 'Neil')
    0.8454999999999999
    >>> sim_strcmp95('aluminum', 'Catalan')
    0.6547619047619048
    >>> sim_strcmp95('ATCG', 'TAGC')
    0.8333333333333334
    """
    def _inrange(char):
        """Return True if char is in the range (0, 91)."""
        return ord(char) > 0 and ord(char) < 91

    ying = src.strip().upper()
    yang = tar.strip().upper()

    if ying == yang:
        return 1.0
    # If either string is blank - return - added in Version 2
    if not ying or not yang:
        return 0.0

    adjwt = defaultdict(int)
    sp_mx = (
        ('A', 'E'), ('A', 'I'), ('A', 'O'), ('A', 'U'), ('B', 'V'), ('E', 'I'),
        ('E', 'O'), ('E', 'U'), ('I', 'O'), ('I', 'U'), ('O', 'U'), ('I', 'Y'),
        ('E', 'Y'), ('C', 'G'), ('E', 'F'), ('W', 'U'), ('W', 'V'), ('X', 'K'),
        ('S', 'Z'), ('X', 'S'), ('Q', 'C'), ('U', 'V'), ('M', 'N'), ('L', 'I'),
        ('Q', 'O'), ('P', 'R'), ('I', 'J'), ('2', 'Z'), ('5', 'S'), ('8', 'B'),
        ('1', 'I'), ('1', 'L'), ('0', 'O'), ('0', 'Q'), ('C', 'K'), ('G', 'J')
    )

    # Initialize the adjwt array on the first call to the function only.
    # The adjwt array is used to give partial credit for characters that
    # may be errors due to known phonetic or character recognition errors.
    # A typical example is to match the letter "O" with the number "0"
    for i in sp_mx:
        adjwt[(i[0], i[1])] = 3
        adjwt[(i[1], i[0])] = 3

    if len(ying) > len(yang):
        search_range = len(ying)
        minv = len(yang)
    else:
        search_range = len(yang)
        minv = len(ying)

    # Blank out the flags
    ying_flag = [0] * search_range
    yang_flag = [0] * search_range
    search_range = max(0, search_range // 2 - 1)

    # Looking only within the search range, count and flag the matched pairs.
    num_com = 0
    yl1 = len(yang) - 1
    for i in range(len(ying)):
        lowlim = (i - search_range) if (i >= search_range) else 0
        hilim = (i + search_range) if ((i + search_range) <= yl1) else yl1
        for j in range(lowlim, hilim+1):
            if (yang_flag[j] == 0) and (yang[j] == ying[i]):
                yang_flag[j] = 1
                ying_flag[i] = 1
                num_com += 1
                break

    # If no characters in common - return
    if num_com == 0:
        return 0.0

    # Count the number of transpositions
    k = n_trans = 0
    for i in range(len(ying)):
        if ying_flag[i] != 0:
            for j in range(k, len(yang)):
                if yang_flag[j] != 0:
                    k = j + 1
                    break
            if ying[i] != yang[j]:
                n_trans += 1
    n_trans = n_trans // 2

    # Adjust for similarities in unmatched characters
    n_simi = 0
    if minv > num_com:
        for i in range(len(ying)):
            if ying_flag[i] == 0 and _inrange(ying[i]):
                for j in range(len(yang)):
                    if yang_flag[j] == 0 and _inrange(yang[j]):
                        if (ying[i], yang[j]) in adjwt:
                            n_simi += adjwt[(ying[i], yang[j])]
                            yang_flag[j] = 2
                            break
    num_sim = n_simi/10.0 + num_com

    # Main weight computation
    weight = num_sim / len(ying) + num_sim / len(yang) + \
        (num_com - n_trans) / num_com
    weight = weight / 3.0

    # Continue to boost the weight if the strings are similar
    if weight > 0.7:

        # Adjust for having up to the first 4 characters in common
        j = 4 if (minv >= 4) else minv
        i = 0
        while (i < j) and (ying[i] == yang[i]) and (not ying[i].isdigit()):
            i += 1
        if i:
            weight += i * 0.1 * (1.0 - weight)

        # Optionally adjust for long strings.

        # After agreeing beginning chars, at least two more must agree and
        # the agreeing characters must be > .5 of remaining characters.
        if (((long_strings) and (minv > 4) and (num_com > i+1) and
             (2*num_com >= minv+i))):
            if not ying[0].isdigit():
                weight += (1.0-weight) * ((num_com-i-1) /
                                          (len(ying)+len(yang)-i*2+2))

    return weight
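

# A note on the partial-credit mechanism above: an unmatched pair listed in
# adjwt (e.g. ('O', '0') or ('E', 'I')) contributes 3 to n_simi, and
# num_sim = n_simi/10.0 + num_com then counts it as 0.3 of a full character
# match in the weight computation.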


def dist_strcmp95(src, tar, long_strings=False):
    """Return the strcmp95 distance between two strings.

    strcmp95 distance

    strcmp95 distance is 1 - strcmp95 similarity

    :param str src, tar: two strings to be compared
    :param bool long_strings: set to True to "Increase the probability of a
        match when the number of matched characters is large.  This option
        allows for a little more tolerance when the strings are large. It is
        not an appropriate test when comparing fixed length fields such as
        phone and social security numbers."
    :returns: strcmp95 distance
    :rtype: float

    >>> dist_strcmp95('cat', 'hat')
    0.22222222222222232
    >>> dist_strcmp95('Niall', 'Neil')
    0.15450000000000008
    >>> dist_strcmp95('aluminum', 'Catalan')
    0.34523809523809523
    >>> dist_strcmp95('ATCG', 'TAGC')
    0.16666666666666663
    """
    return 1 - sim_strcmp95(src, tar, long_strings)
def sim_jaro_winkler(src, tar, qval=1, mode='winkler', long_strings=False,
0 ignored issues
show
best-practice introduced by
Too many arguments (7/5)
Loading history...
Comprehensibility introduced by
This function exceeds the maximum number of variables (22/15).
Loading history...
1324
                     boost_threshold=0.7, scaling_factor=0.1):
1325
    """Return the Jaro or Jaro-Winkler similarity of two strings.
1326
1327
    Jaro(-Winkler) distance
1328
1329
    This is Python based on the C code for strcmp95:
1330
    http://web.archive.org/web/20110629121242/http://www.census.gov/geo/msb/stand/strcmp.c
1331
    The above file is a US Government publication and, accordingly,
1332
    in the public domain.
1333
1334
    :param str src, tar: two strings to be compared
1335
    :param int qval: the length of each q-gram (defaults to 1: character-wise
1336
        matching)
1337
    :param str mode: indicates which variant of this distance metric to
1338
        compute:
1339
1340
            - 'winkler' -- computes the Jaro-Winkler distance (default) which
1341
              increases the score for matches near the start of the word
1342
            - 'jaro' -- computes the Jaro distance
1343
1344
    The following arguments apply only when mode is 'winkler':
1345
1346
    :param bool long_strings: set to True to "Increase the probability of a
1347
        match when the number of matched characters is large.  This option
1348
        allows for a little more tolerance when the strings are large.  It is
1349
        not an appropriate test when comparing fixed length fields such as
1350
        phone and social security numbers."
1351
    :param float boost_threshold: a value between 0 and 1, below which the
1352
        Winkler boost is not applied (defaults to 0.7)
1353
    :param float scaling_factor: a value between 0 and 0.25, indicating by how
1354
        much to boost scores for matching prefixes (defaults to 0.1)
1355
1356
    :returns: Jaro or Jaro-Winkler similarity
1357
    :rtype: float
1358
1359
    >>> sim_jaro_winkler('cat', 'hat')
1360
    0.7777777777777777
1361
    >>> sim_jaro_winkler('Niall', 'Neil')
1362
    0.8049999999999999
1363
    >>> sim_jaro_winkler('aluminum', 'Catalan')
1364
    0.6011904761904762
1365
    >>> sim_jaro_winkler('ATCG', 'TAGC')
1366
    0.8333333333333334
1367
1368
    >>> sim_jaro_winkler('cat', 'hat', mode='jaro')
1369
    0.7777777777777777
1370
    >>> sim_jaro_winkler('Niall', 'Neil', mode='jaro')
1371
    0.7833333333333333
1372
    >>> sim_jaro_winkler('aluminum', 'Catalan', mode='jaro')
1373
    0.6011904761904762
1374
    >>> sim_jaro_winkler('ATCG', 'TAGC', mode='jaro')
1375
    0.8333333333333334
1376
    """
1377
    if mode == 'winkler':
1378
        if boost_threshold > 1 or boost_threshold < 0:
1379
            raise ValueError('Unsupported boost_threshold assignment; ' +
1380
                             'boost_threshold must be between 0 and 1.')
1381
        if scaling_factor > 0.25 or scaling_factor < 0:
1382
            raise ValueError('Unsupported scaling_factor assignment; ' +
1383
                             'scaling_factor must be between 0 and 0.25.')
1384
1385
    if src == tar:
1386
        return 1.0
1387
1388
    src = QGrams(src.strip(), qval).ordered_list
1389
    tar = QGrams(tar.strip(), qval).ordered_list
1390
1391
    lens = len(src)
1392
    lent = len(tar)
1393
1394
    # If either string is blank - return - added in Version 2
1395
    if lens == 0 or lent == 0:
1396
        return 0.0
1397
1398
    if lens > lent:
1399
        search_range = lens
1400
        minv = lent
1401
    else:
1402
        search_range = lent
1403
        minv = lens
1404
1405
    # Zero out the flags
1406
    src_flag = [0] * search_range
1407
    tar_flag = [0] * search_range
1408
    search_range = max(0, search_range//2 - 1)
1409
1410
    # Looking only within the search range, count and flag the matched pairs.
1411
    num_com = 0
1412
    yl1 = lent - 1
1413
    for i in range(lens):
1414
        lowlim = (i - search_range) if (i >= search_range) else 0
1415
        hilim = (i + search_range) if ((i + search_range) <= yl1) else yl1
1416
        for j in range(lowlim, hilim+1):
1417
            if (tar_flag[j] == 0) and (tar[j] == src[i]):
1418
                tar_flag[j] = 1
1419
                src_flag[i] = 1
1420
                num_com += 1
1421
                break
1422
1423
    # If no characters in common - return
1424
    if num_com == 0:
1425
        return 0.0
1426
1427
    # Count the number of transpositions
1428
    k = n_trans = 0
1429
    for i in range(lens):
1430
        if src_flag[i] != 0:
1431
            for j in range(k, lent):
1432
                if tar_flag[j] != 0:
1433
                    k = j + 1
1434
                    break
1435
            if src[i] != tar[j]:
0 ignored issues
show
introduced by
The variable j does not seem to be defined for all execution paths.
Loading history...
1436
                n_trans += 1
1437
    n_trans = n_trans // 2
1438
1439
    # Main weight computation for Jaro distance
1440
    weight = num_com / lens + num_com / lent + (num_com - n_trans) / num_com
1441
    weight = weight / 3.0
1442
1443
    # Continue to boost the weight if the strings are similar
1444
    # This is the Winkler portion of Jaro-Winkler distance
1445
    if mode == 'winkler' and weight > boost_threshold:
1446
1447
        # Adjust for having up to the first 4 characters in common
1448
        j = 4 if (minv >= 4) else minv
1449
        i = 0
1450
        while (i < j) and (src[i] == tar[i]):
1451
            i += 1
1452
        if i:
1453
            weight += i * scaling_factor * (1.0 - weight)
1454
1455
        # Optionally adjust for long strings.
1456
1457
        # After agreeing beginning chars, at least two more must agree and
1458
        # the agreeing characters must be > .5 of remaining characters.
1459
        if (((long_strings) and (minv > 4) and (num_com > i+1) and
1460
             (2*num_com >= minv+i))):
1461
            weight += (1.0-weight) * ((num_com-i-1) / (lens+lent-i*2+2))
1462
1463
    return weight
1464
1465
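# A hedged sketch of the Winkler boost applied above (names here are
# illustrative, not part of this module's API): given a Jaro weight and the
# length of the common prefix (capped at 4), the boosted weight is
#
#     boosted = weight + prefix_len * scaling_factor * (1.0 - weight)
#
# e.g. for 'Niall'/'Neil' the Jaro weight is 0.7833..., the common prefix is
# 'N' (prefix_len = 1), and 0.7833... + 1 * 0.1 * (1 - 0.7833...) == 0.805,
# matching 1 - dist_jaro_winkler('Niall', 'Neil') in the doctests below.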
1466
def dist_jaro_winkler(src, tar, qval=1, mode='winkler', long_strings=False,
0 ignored issues
show
best-practice introduced by
Too many arguments (7/5)
Loading history...
1467
                      boost_threshold=0.7, scaling_factor=0.1):
1468
    """Return the Jaro or Jaro-Winkler distance between two strings.
1469
1470
    Jaro(-Winkler) distance
1471
1472
    Jaro-Winkler distance is 1 - the Jaro-Winkler similarity
1473
1474
    :param str src, tar: two strings to be compared
1475
    :param int qval: the length of each q-gram (defaults to 1: character-wise
1476
        matching)
1477
    :param str mode: indicates which variant of this distance metric to
1478
        compute:
1479
1480
            - 'winkler' -- computes the Jaro-Winkler distance (default) which
1481
              increases the score for matches near the start of the word
1482
            - 'jaro' -- computes the Jaro distance
1483
1484
    The following arguments apply only when mode is 'winkler':
1485
1486
    :param bool long_strings: set to True to "Increase the probability of a
1487
        match when the number of matched characters is large.  This option
1488
        allows for a little more tolerance when the strings are large.  It is
1489
        not an appropriate test when comparing fixed length fields such as
1490
        phone and social security numbers."
1491
    :param float boost_threshold: a value between 0 and 1, below which the
1492
        Winkler boost is not applied (defaults to 0.7)
1493
    :param float scaling_factor: a value between 0 and 0.25, indicating by how
1494
        much to boost scores for matching prefixes (defaults to 0.1)
1495
1496
    :returns: Jaro or Jaro-Winkler distance
1497
    :rtype: float
1498
1499
    >>> dist_jaro_winkler('cat', 'hat')
1500
    0.22222222222222232
1501
    >>> dist_jaro_winkler('Niall', 'Neil')
1502
    0.19500000000000006
1503
    >>> dist_jaro_winkler('aluminum', 'Catalan')
1504
    0.39880952380952384
1505
    >>> dist_jaro_winkler('ATCG', 'TAGC')
1506
    0.16666666666666663
1507
1508
    >>> dist_jaro_winkler('cat', 'hat', mode='jaro')
1509
    0.22222222222222232
1510
    >>> dist_jaro_winkler('Niall', 'Neil', mode='jaro')
1511
    0.21666666666666667
1512
    >>> dist_jaro_winkler('aluminum', 'Catalan', mode='jaro')
1513
    0.39880952380952384
1514
    >>> dist_jaro_winkler('ATCG', 'TAGC', mode='jaro')
1515
    0.16666666666666663
1516
    """
1517
    return 1 - sim_jaro_winkler(src, tar, qval, mode, long_strings,
1518
                                boost_threshold, scaling_factor)
1519
1520
1521
def lcsseq(src, tar):
1522
    """Return the longest common subsequence of two strings.
1523
1524
    Longest common subsequence (LCSseq)
1525
1526
    Based on the dynamic programming algorithm from
1527
    http://rosettacode.org/wiki/Longest_common_subsequence#Dynamic_Programming_6
1528
    This is licensed GFDL 1.2
1529
1530
    Modifications include:
1531
        conversion to a numpy array in place of a list of lists
1532
1533
    :param str src, tar: two strings to be compared
1534
    :returns: the longest common subsequence
1535
    :rtype: str
1536
1537
    >>> lcsseq('cat', 'hat')
1538
    'at'
1539
    >>> lcsseq('Niall', 'Neil')
1540
    'Nil'
1541
    >>> lcsseq('aluminum', 'Catalan')
1542
    'aln'
1543
    >>> lcsseq('ATCG', 'TAGC')
1544
    'AC'
1545
    """
1546
    # pylint: disable=no-member
1547
    lengths = np.zeros((len(src)+1, len(tar)+1), dtype=np.int)
1548
    # pylint: enable=no-member
1549
1550
    # row 0 and column 0 are initialized to 0 already
1551
    for i, src_char in enumerate(src):
1552
        for j, tar_char in enumerate(tar):
1553
            if src_char == tar_char:
1554
                lengths[i+1, j+1] = lengths[i, j] + 1
1555
            else:
1556
                lengths[i+1, j+1] = max(lengths[i+1, j], lengths[i, j+1])
1557
1558
    # read the substring out from the matrix
1559
    result = ''
1560
    i, j = len(src), len(tar)
1561
    while i != 0 and j != 0:
1562
        if lengths[i, j] == lengths[i-1, j]:
1563
            i -= 1
1564
        elif lengths[i, j] == lengths[i, j-1]:
1565
            j -= 1
1566
        else:
1567
            result = src[i-1] + result
1568
            i -= 1
1569
            j -= 1
1570
    return result
1571
1572
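# A minimal standalone sketch of the same DP recurrence using only
# O(len(tar)) memory (pure Python, no numpy; lcsseq_len is an illustrative
# name, not part of this module):
#
#     def lcsseq_len(src, tar):
#         row = [0] * (len(tar) + 1)
#         for s_char in src:
#             prev = 0
#             for j, t_char in enumerate(tar):
#                 prev, row[j + 1] = row[j + 1], (
#                     prev + 1 if s_char == t_char
#                     else max(row[j + 1], row[j]))
#         return row[-1]
#
#     # lcsseq_len('Niall', 'Neil') == 3 == len(lcsseq('Niall', 'Neil'))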
1573
def sim_lcsseq(src, tar):
1574
    r"""Return the longest common subsequence similarity of two strings.
1575
1576
    Longest common subsequence similarity (:math:`sim_{LCSseq}`)
1577
1578
    This employs the LCSseq function to derive a similarity metric:
1579
    :math:`sim_{LCSseq}(s,t) = \\frac{|LCSseq(s,t)|}{max(|s|, |t|)}`
1580
1581
    :param str src, tar: two strings to be compared
1582
    :returns: LCSseq similarity
1583
    :rtype: float
1584
1585
    >>> sim_lcsseq('cat', 'hat')
1586
    0.6666666666666666
1587
    >>> sim_lcsseq('Niall', 'Neil')
1588
    0.6
1589
    >>> sim_lcsseq('aluminum', 'Catalan')
1590
    0.375
1591
    >>> sim_lcsseq('ATCG', 'TAGC')
1592
    0.5
1593
    """
1594
    if src == tar:
0 ignored issues
show
unused-code introduced by
Unnecessary "elif" after "return"
Loading history...
1595
        return 1.0
1596
    elif not src or not tar:
1597
        return 0.0
1598
    return len(lcsseq(src, tar)) / max(len(src), len(tar))
1599
1600
1601
def dist_lcsseq(src, tar):
1602
    """Return the longest common subsequence distance between two strings.
1603
1604
    Longest common subsequence distance (:math:`dist_{LCSseq}`)
1605
1606
    This employs the LCSseq function to derive a distance metric:
1607
    :math:`dist_{LCSseq}(s,t) = 1 - sim_{LCSseq}(s,t)`
1608
1609
    :param str src, tar: two strings to be compared
1610
    :returns: LCSseq distance
1611
    :rtype: float
1612
1613
    >>> dist_lcsseq('cat', 'hat')
1614
    0.33333333333333337
1615
    >>> dist_lcsseq('Niall', 'Neil')
1616
    0.4
1617
    >>> dist_lcsseq('aluminum', 'Catalan')
1618
    0.625
1619
    >>> dist_lcsseq('ATCG', 'TAGC')
1620
    0.5
1621
    """
1622
    return 1 - sim_lcsseq(src, tar)
1623
1624
1625
def lcsstr(src, tar):
1626
    """Return the longest common substring of two strings.
1627
1628
    Longest common substring (LCSstr)
1629
1630
    Based on the code from
1631
    https://en.wikibooks.org/wiki/Algorithm_Implementation/Strings/Longest_common_substring#Python
1632
    This is licensed Creative Commons: Attribution-ShareAlike 3.0
1633
1634
    Modifications include:
1635
1636
        - conversion to a numpy array in place of a list of lists
1637
        - conversion to Python 2/3-safe range from xrange via six
1638
1639
    :param str src, tar: two strings to be compared
1640
    :returns: the longest common substring
1641
    :rtype: str
1642
1643
    >>> lcsstr('cat', 'hat')
1644
    'at'
1645
    >>> lcsstr('Niall', 'Neil')
1646
    'N'
1647
    >>> lcsstr('aluminum', 'Catalan')
1648
    'al'
1649
    >>> lcsstr('ATCG', 'TAGC')
1650
    'A'
1651
    """
1652
    # pylint: disable=no-member
1653
    lengths = np.zeros((len(src)+1, len(tar)+1), dtype=np.int)
1654
    # pylint: enable=no-member
1655
    longest, i_longest = 0, 0
1656
    for i in range(1, len(src)+1):
1657
        for j in range(1, len(tar)+1):
1658
            if src[i-1] == tar[j-1]:
1659
                lengths[i, j] = lengths[i-1, j-1] + 1
1660
                if lengths[i, j] > longest:
1661
                    longest = lengths[i, j]
1662
                    i_longest = i
1663
            else:
1664
                lengths[i, j] = 0
1665
    return src[i_longest - longest:i_longest]
1666
1667
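# A short hand trace of the slice above, for the 'aluminum'/'Catalan'
# doctest: the longest diagonal run of matches is 'al' (longest == 2),
# ending at src index i_longest == 2, so the returned slice is
# 'aluminum'[0:2] == 'al'.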
1668
def sim_lcsstr(src, tar):
1669
    r"""Return the longest common substring similarity of two strings.
1670
1671
    Longest common substring similarity (:math:`sim_{LCSstr}`)
1672
1673
    This employs the LCSstr function to derive a similarity metric:
1674
    :math:`sim_{LCSstr}(s,t) = \\frac{|LCSstr(s,t)|}{max(|s|, |t|)}`
1675
1676
    :param str src, tar: two strings to be compared
1677
    :returns: LCSstr similarity
1678
    :rtype: float
1679
1680
    >>> sim_lcsstr('cat', 'hat')
1681
    0.6666666666666666
1682
    >>> sim_lcsstr('Niall', 'Neil')
1683
    0.2
1684
    >>> sim_lcsstr('aluminum', 'Catalan')
1685
    0.25
1686
    >>> sim_lcsstr('ATCG', 'TAGC')
1687
    0.25
1688
    """
1689
    if src == tar:
0 ignored issues
show
unused-code introduced by
Unnecessary "elif" after "return"
Loading history...
1690
        return 1.0
1691
    elif not src or not tar:
1692
        return 0.0
1693
    return len(lcsstr(src, tar)) / max(len(src), len(tar))
1694
1695
1696
def dist_lcsstr(src, tar):
1697
    """Return the longest common substring distance between two strings.
1698
1699
    Longest common substring distance (:math:`dist_{LCSstr}`)
1700
1701
    This employs the LCSstr function to derive a distance metric:
1702
    :math:`dist_{LCSstr}(s,t) = 1 - sim_{LCSstr}(s,t)`
1703
1704
    :param str src, tar: two strings to be compared
1705
    :returns: LCSstr distance
1706
    :rtype: float
1707
1708
    >>> dist_lcsstr('cat', 'hat')
1709
    0.33333333333333337
1710
    >>> dist_lcsstr('Niall', 'Neil')
1711
    0.8
1712
    >>> dist_lcsstr('aluminum', 'Catalan')
1713
    0.75
1714
    >>> dist_lcsstr('ATCG', 'TAGC')
1715
    0.75
1716
    """
1717
    return 1 - sim_lcsstr(src, tar)
1718
1719
1720
def sim_ratcliff_obershelp(src, tar):
1721
    """Return the Ratcliff-Obershelp similarity of two strings.
1722
1723
    Ratcliff-Obershelp similarity
1724
1725
    This follows the Ratcliff-Obershelp algorithm to derive a similarity
1726
    measure:
1727
1728
        1. Find the length of the longest common substring in src & tar.
1729
        2. Recurse on the strings to the left & right of this substring
1730
           in src & tar. The base case is a 0 length common substring, in which
1731
           case, return 0. Otherwise, return the sum of the current longest
1732
           common substring and the left & right recursed sums.
1733
        3. Multiply this length by 2 and divide by the sum of the lengths of
1734
           src & tar.
1735
1736
    Cf.
1737
    http://www.drdobbs.com/database/pattern-matching-the-gestalt-approach/184407970
1738
1739
    :param str src, tar: two strings to be compared
1740
    :returns: Ratcliff-Obershelp similarity
1741
    :rtype: float
1742
1743
    >>> sim_ratcliff_obershelp('cat', 'hat')
1744
    0.66666666666666663
1745
    >>> sim_ratcliff_obershelp('Niall', 'Neil')
1746
    0.66666666666666663
1747
    >>> sim_ratcliff_obershelp('aluminum', 'Catalan')
1748
    0.40000000000000002
1749
    >>> sim_ratcliff_obershelp('ATCG', 'TAGC')
1750
    0.5
1751
    """
1752
    def _lcsstr_stl(src, tar):
1753
        """Return start positions & length for Ratcliff-Obershelp.
1754
1755
        Return the start position in the source string, start position in
1756
        the target string, and length of the longest common substring of
1757
        strings src and tar.
1758
        """
1759
        # pylint: disable=no-member
1760
        lengths = np.zeros((len(src)+1, len(tar)+1), dtype=np.int)
1761
        # pylint: enable=no-member
1762
        longest, src_longest, tar_longest = 0, 0, 0
1763
        for i in range(1, len(src)+1):
1764
            for j in range(1, len(tar)+1):
1765
                if src[i-1] == tar[j-1]:
1766
                    lengths[i, j] = lengths[i-1, j-1] + 1
1767
                    if lengths[i, j] > longest:
1768
                        longest = lengths[i, j]
1769
                        src_longest = i
1770
                        tar_longest = j
1771
                else:
1772
                    lengths[i, j] = 0
1773
        return (src_longest-longest, tar_longest-longest, longest)
1774
1775
    def _sstr_matches(src, tar):
1776
        """Return the sum of substring match lengths.
1777
1778
        This follows the Ratcliff-Obershelp algorithm:
1779
             1. Find the length of the longest common substring in src & tar.
1780
             2. Recurse on the strings to the left & right of this
1781
                 substring in src & tar.
1782
             3. Base case is a 0 length common substring, in which case,
1783
                 return 0.
1784
             4. Return the sum.
1785
        """
1786
        src_start, tar_start, length = _lcsstr_stl(src, tar)
1787
        if length == 0:
1788
            return 0
1789
        return (_sstr_matches(src[:src_start], tar[:tar_start]) +
1790
                length +
1791
                _sstr_matches(src[src_start+length:], tar[tar_start+length:]))
1792
1793
    if src == tar:
0 ignored issues
show
unused-code introduced by
Unnecessary "elif" after "return"
Loading history...
1794
        return 1.0
1795
    elif not src or not tar:
1796
        return 0.0
1797
    return 2*_sstr_matches(src, tar)/(len(src)+len(tar))
1798
1799
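# A hand trace of the recursion above for the 'Niall'/'Neil' doctest: the
# anchor substring is 'N'; recursing right on 'iall'/'eil' matches 'i' and
# then 'l', so _sstr_matches totals 3 and the similarity is
# 2 * 3 / (5 + 4) == 0.666...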
1800
def dist_ratcliff_obershelp(src, tar):
1801
    """Return the Ratcliff-Obershelp distance between two strings.
1802
1803
    Ratcliff-Obershelp distance
1804
1805
    Ratcliff-Obershelp distance is the complement of Ratcliff-Obershelp
1806
    similarity:
1807
    :math:`dist_{Ratcliff-Obershelp} = 1 - sim_{Ratcliff-Obershelp}`
1808
1809
    :param str src, tar: two strings to be compared
1810
    :returns: Ratcliff-Obershelp distance
1811
    :rtype: float
1812
1813
    >>> dist_ratcliff_obershelp('cat', 'hat')
1814
    0.33333333333333337
1815
    >>> dist_ratcliff_obershelp('Niall', 'Neil')
1816
    0.33333333333333337
1817
    >>> dist_ratcliff_obershelp('aluminum', 'Catalan')
1818
    0.59999999999999998
1819
    >>> dist_ratcliff_obershelp('ATCG', 'TAGC')
1820
    0.5
1821
    """
1822
    return 1 - sim_ratcliff_obershelp(src, tar)
1823
1824
1825
def mra_compare(src, tar):
1826
    """Return the MRA comparison rating of two strings.
1827
1828
    Western Airlines Surname Match Rating Algorithm comparison rating
1829
1830
    A description of the algorithm can be found on page 18 of
1831
    https://archive.org/details/accessingindivid00moor
1832
1833
    :param str src, tar: two strings to be compared
1834
    :returns: MRA comparison rating
1835
    :rtype: int
1836
1837
    >>> mra_compare('cat', 'hat')
1838
    5
1839
    >>> mra_compare('Niall', 'Neil')
1840
    6
1841
    >>> mra_compare('aluminum', 'Catalan')
1842
    0
1843
    >>> mra_compare('ATCG', 'TAGC')
1844
    5
1845
    """
1846
    if src == tar:
1847
        return 6
1848
    if src == '' or tar == '':
1849
        return 0
1850
    src = list(mra(src))
1851
    tar = list(mra(tar))
1852
1853
    if abs(len(src)-len(tar)) > 2:
1854
        return 0
1855
1856
    length_sum = len(src) + len(tar)
1857
    if length_sum < 5:
1858
        min_rating = 5
1859
    elif length_sum < 8:
1860
        min_rating = 4
1861
    elif length_sum < 12:
1862
        min_rating = 3
1863
    else:
1864
        min_rating = 2
1865
1866
    for _ in range(2):
1867
        new_src = []
1868
        new_tar = []
1869
        minlen = min(len(src), len(tar))
1870
        for i in range(minlen):
1871
            if src[i] != tar[i]:
1872
                new_src.append(src[i])
1873
                new_tar.append(tar[i])
1874
        src = new_src+src[minlen:]
1875
        tar = new_tar+tar[minlen:]
1876
        src.reverse()
1877
        tar.reverse()
1878
1879
    similarity = 6 - max(len(src), len(tar))
1880
1881
    if similarity >= min_rating:
1882
        return similarity
1883
    return 0
1884
1885
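# A hedged trace of the comparison above for the 'Niall'/'Neil' doctest
# (assuming mra() yields 'NL' for both names): the codes have equal length,
# every position agrees, so both lists empty out, the rating is 6 - 0 = 6,
# and 6 meets the minimum rating of 5 for a combined code length of 4.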
1886
def sim_mra(src, tar):
1887
    """Return the normalized MRA similarity of two strings.
1888
1889
    Normalized Match Rating Algorithm similarity
1890
1891
    This is the MRA normalized to :math:`[0, 1]`, given that MRA itself is
1892
    constrained to the range :math:`[0, 6]`.
1893
1894
    :param str src, tar: two strings to be compared
1895
    :returns: normalized MRA similarity
1896
    :rtype: float
1897
1898
    >>> sim_mra('cat', 'hat')
1899
    0.8333333333333334
1900
    >>> sim_mra('Niall', 'Neil')
1901
    1.0
1902
    >>> sim_mra('aluminum', 'Catalan')
1903
    0.0
1904
    >>> sim_mra('ATCG', 'TAGC')
1905
    0.8333333333333334
1906
    """
1907
    return mra_compare(src, tar)/6
1908
1909
1910
def dist_mra(src, tar):
1911
    """Return the normalized MRA distance between two strings.
1912
1913
    Normalized Match Rating Algorithm distance
1914
1915
    MRA distance is the complement of MRA similarity:
1916
    :math:`dist_{MRA} = 1 - sim_{MRA}`
1917
1918
    :param str src, tar: two strings to be compared
1919
    :returns: normalized MRA distance
1920
    :rtype: float
1921
1922
    >>> dist_mra('cat', 'hat')
1923
    0.16666666666666663
1924
    >>> dist_mra('Niall', 'Neil')
1925
    0.0
1926
    >>> dist_mra('aluminum', 'Catalan')
1927
    1.0
1928
    >>> dist_mra('ATCG', 'TAGC')
1929
    0.16666666666666663
1930
    """
1931
    return 1 - sim_mra(src, tar)
1932
1933
1934
def dist_compression(src, tar, compressor='bz2', probs=None):
1935
    """Return the normalized compression distance between two strings.
1936
1937
    Normalized compression distance (NCD)
1938
1939
    Cf.
1940
    https://en.wikipedia.org/wiki/Normalized_compression_distance#Normalized_compression_distance
1941
1942
    :param str src, tar: two strings to be compared
1943
    :param str compressor: a compression scheme to use for the similarity
1944
        calculation, from the following:
1945
1946
            - `zlib` -- standard zlib/gzip
1947
            - `bz2` -- bzip2 (default)
1948
            - `lzma` -- Lempel–Ziv–Markov chain algorithm
1949
            - `arith` -- arithmetic coding
1950
            - `rle` -- run-length encoding
1951
            - `bwtrle` -- Burrows-Wheeler transform followed by run-length
1952
              encoding
1953
1954
    :param dict probs: a dictionary trained with ac_train (for the arith
1955
        compressor only)
1956
    :returns: compression distance
1957
    :rtype: float
1958
1959
    >>> dist_compression('cat', 'hat')
1960
    0.08
1961
    >>> dist_compression('Niall', 'Neil')
1962
    0.037037037037037035
1963
    >>> dist_compression('aluminum', 'Catalan')
1964
    0.20689655172413793
1965
    >>> dist_compression('ATCG', 'TAGC')
1966
    0.037037037037037035
1967
1968
    >>> dist_compression('Niall', 'Neil', compressor='zlib')
1969
    0.45454545454545453
1970
    >>> dist_compression('Niall', 'Neil', compressor='bz2')
1971
    0.037037037037037035
1972
    >>> dist_compression('Niall', 'Neil', compressor='lzma')
1973
    0.16
1974
    >>> dist_compression('Niall', 'Neil', compressor='arith')
1975
    0.6875
1976
    >>> dist_compression('Niall', 'Neil', compressor='rle')
1977
    1.0
1978
    >>> dist_compression('Niall', 'Neil', compressor='bwtrle')
1979
    0.8333333333333334
1980
    """
1981
    if src == tar:
1982
        return 0.0
1983
1984
    if compressor not in {'arith', 'rle', 'bwtrle'}:
1985
        src = src.encode('utf-8')
1986
        tar = tar.encode('utf-8')
1987
1988
    if compressor == 'bz2':
1989
        src_comp = codecs.encode(src, 'bz2_codec')[15:]
1990
        tar_comp = codecs.encode(tar, 'bz2_codec')[15:]
1991
        concat_comp = codecs.encode(src+tar, 'bz2_codec')[15:]
1992
        concat_comp2 = codecs.encode(tar+src, 'bz2_codec')[15:]
1993
    elif compressor == 'lzma':
1994
        if 'lzma' in sys.modules:
1995
            src_comp = lzma.compress(src)[14:]
1996
            tar_comp = lzma.compress(tar)[14:]
1997
            concat_comp = lzma.compress(src+tar)[14:]
1998
            concat_comp2 = lzma.compress(tar+src)[14:]
1999
        else:  # pragma: no cover
2000
            raise ValueError('Install the PylibLZMA module in order to use ' +
2001
                             'lzma compression similarity')
2002
    elif compressor == 'arith':
2003
        if probs is None:
2004
            # lacking a reasonable dictionary, train on the strings themselves
2005
            probs = ac_train(src+tar)
2006
        src_comp = ac_encode(src, probs)[1]
2007
        tar_comp = ac_encode(tar, probs)[1]
2008
        concat_comp = ac_encode(src+tar, probs)[1]
2009
        concat_comp2 = ac_encode(tar+src, probs)[1]
2010
        return ((min(concat_comp, concat_comp2) - min(src_comp, tar_comp)) /
2011
                max(src_comp, tar_comp))
2012
    elif compressor in {'rle', 'bwtrle'}:
2013
        src_comp = rle_encode(src, (compressor == 'bwtrle'))
2014
        tar_comp = rle_encode(tar, (compressor == 'bwtrle'))
2015
        concat_comp = rle_encode(src+tar, (compressor == 'bwtrle'))
2016
        concat_comp2 = rle_encode(tar+src, (compressor == 'bwtrle'))
2017
    else:  # zlib
2018
        src_comp = codecs.encode(src, 'zlib_codec')[2:]
2019
        tar_comp = codecs.encode(tar, 'zlib_codec')[2:]
2020
        concat_comp = codecs.encode(src+tar, 'zlib_codec')[2:]
2021
        concat_comp2 = codecs.encode(tar+src, 'zlib_codec')[2:]
2022
    return ((min(len(concat_comp), len(concat_comp2)) -
2023
             min(len(src_comp), len(tar_comp))) /
2024
            max(len(src_comp), len(tar_comp)))
2025
2026
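# A minimal self-contained sketch of the same NCD formula for the zlib case
# (ncd_zlib is an illustrative name; unlike the function above, this keeps
# the 2-byte zlib header, so values differ slightly):
#
#     import zlib
#
#     def ncd_zlib(src, tar):
#         def size(data):
#             return len(zlib.compress(data))
#         s, t = src.encode('utf-8'), tar.encode('utf-8')
#         return ((min(size(s + t), size(t + s)) - min(size(s), size(t))) /
#                 max(size(s), size(t)))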
2027
def sim_compression(src, tar, compressor='bz2', probs=None):
2028
    """Return the normalized compression similarity of two strings.
2029
2030
    Normalized compression similarity (NCS)
2031
2032
    Normalized compression similarity is the complement of normalized
2033
    compression distance:
2034
    :math:`sim_{NCS} = 1 - dist_{NCD}`
2035
2036
    :param str src, tar: two strings to be compared
2037
    :param str compressor: a compression scheme to use for the similarity
2038
        calculation:
2039
2040
            - `zlib` -- standard zlib/gzip
2041
            - `bz2` -- bzip2 (default)
2042
            - `lzma` -- Lempel–Ziv–Markov chain algorithm
2043
            - `arith` -- arithmetic coding
2044
            - `rle` -- run-length encoding
2045
            - `bwtrle` -- Burrows-Wheeler transform followed by run-length
2046
              encoding
2047
2048
    :param dict probs: a dictionary trained with ac_train (for the arith
2049
        compressor only)
2050
    :returns: compression similarity
2051
    :rtype: float
2052
2053
    >>> sim_compression('cat', 'hat')
2054
    0.92
2055
    >>> sim_compression('Niall', 'Neil')
2056
    0.962962962962963
2057
    >>> sim_compression('aluminum', 'Catalan')
2058
    0.7931034482758621
2059
    >>> sim_compression('ATCG', 'TAGC')
2060
    0.962962962962963
2061
2062
    >>> sim_compression('Niall', 'Neil', compressor='zlib')
2063
    0.5454545454545454
2064
    >>> sim_compression('Niall', 'Neil', compressor='bz2')
2065
    0.962962962962963
2066
    >>> sim_compression('Niall', 'Neil', compressor='lzma')
2067
    0.84
2068
    >>> sim_compression('Niall', 'Neil', compressor='arith')
2069
    0.3125
2070
    >>> sim_compression('Niall', 'Neil', compressor='rle')
2071
    0.0
2072
    >>> sim_compression('Niall', 'Neil', compressor='bwtrle')
2073
    0.16666666666666663
2074
    """
2075
    return 1 - dist_compression(src, tar, compressor, probs)
2076
2077
2078
def sim_monge_elkan(src, tar, sim_func=sim_levenshtein, symmetric=False):
2079
    """Return the Monge-Elkan similarity of two strings.
2080
2081
    Monge-Elkan similarity
2082
2083
    Monge-Elkan is defined in:
2084
    Monge, Alvaro E. and Charles P. Elkan. 1996. "The field matching problem:
2085
    Algorithms and applications." KDD-9 Proceedings.
2086
    http://www.aaai.org/Papers/KDD/1996/KDD96-044.pdf
2087
2088
    Note: Monge-Elkan is NOT a symmetric similarity algorithm. Thus, the
2089
    similarity of src to tar is not necessarily equal to the similarity of
2090
    tar to src. If the symmetric argument is True, a symmetric value is calculated,
2091
    at the cost of doubling the computation time (since the
2092
    :math:`sim_{Monge-Elkan}(src, tar)` and
2093
    :math:`sim_{Monge-Elkan}(tar, src)` are both calculated and then averaged).
2094
2095
    :param str src, tar: two strings to be compared
2096
    :param function sim_func: the internal similarity metric to employ
2097
    :param bool symmetric: return a symmetric similarity measure
2098
    :returns: Monge-Elkan similarity
2099
    :rtype: float
2100
2101
    >>> sim_monge_elkan('cat', 'hat')
2102
    0.75
2103
    >>> sim_monge_elkan('Niall', 'Neil')
2104
    0.66666666666666663
2105
    >>> sim_monge_elkan('aluminum', 'Catalan')
2106
    0.3888888888888889
2107
    >>> sim_monge_elkan('ATCG', 'TAGC')
2108
    0.5
2109
    """
2110
    if src == tar:
2111
        return 1.0
2112
2113
    q_src = sorted(QGrams(src).elements())
2114
    q_tar = sorted(QGrams(tar).elements())
2115
2116
    if not q_src or not q_tar:
2117
        return 0.0
2118
2119
    sum_of_maxes = 0
2120
    for q_s in q_src:
2121
        max_sim = float('-inf')
2122
        for q_t in q_tar:
2123
            max_sim = max(max_sim, sim_func(q_s, q_t))
2124
        sum_of_maxes += max_sim
2125
    sim_em = sum_of_maxes / len(q_src)
2126
2127
    if symmetric:
2128
        sim_em = (sim_em + sim_monge_elkan(tar, src, sim_func, False))/2
2129
2130
    return sim_em
2131
2132
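# The loop above is Monge-Elkan's mean of per-q-gram maxima; equivalently,
# as a one-expression sketch (not part of the module):
#
#     sim_em = sum(max(sim_func(q_s, q_t) for q_t in q_tar)
#                  for q_s in q_src) / len(q_src)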
2133
def dist_monge_elkan(src, tar, sim_func=sim_levenshtein, symmetric=False):
2134
    """Return the Monge-Elkan distance between two strings.
2135
2136
    Monge-Elkan distance
2137
2138
    Monge-Elkan is defined in:
2139
    Monge, Alvaro E. and Charles P. Elkan. 1996. "The field matching problem:
2140
    Algorithms and applications." KDD-9 Proceedings.
2141
    http://www.aaai.org/Papers/KDD/1996/KDD96-044.pdf
2142
2143
    Note: Monge-Elkan is NOT a symmetric similarity algorithm. Thus, the
2144
    distance between src and tar is not necessarily equal to the distance
2145
    between tar and src. If the symmetric argument is True, a symmetric value is
2146
    calculated, at the cost of doubling the computation time (since the
2147
    :math:`sim_{Monge-Elkan}(src, tar)` and :math:`sim_{Monge-Elkan}(tar, src)`
2148
    are both calculated and then averaged).
2149
2150
    :param str src, tar: two strings to be compared
2151
    :param function sim_func: the internal similarity metric to employ
2152
    :param bool symmetric: return a symmetric similarity measure
2153
    :returns: Monge-Elkan distance
2154
    :rtype: float
2155
2156
    >>> dist_monge_elkan('cat', 'hat')
2157
    0.25
2158
    >>> dist_monge_elkan('Niall', 'Neil')
2159
    0.33333333333333337
2160
    >>> dist_monge_elkan('aluminum', 'Catalan')
2161
    0.61111111111111116
2162
    >>> dist_monge_elkan('ATCG', 'TAGC')
2163
    0.5
2164
    """
2165
    return 1 - sim_monge_elkan(src, tar, sim_func, symmetric)
2166
2167
2168
def sim_ident(src, tar):
2169
    """Return the identity similarity of two strings.
2170
2171
    Identity similarity
2172
2173
    This is 1 if the two strings are identical, otherwise 0.
2174
2175
    :param str src, tar: two strings to be compared
2176
    :returns: identity similarity
2177
    :rtype: int
2178
2179
    >>> sim_ident('cat', 'hat')
2180
    0
2181
    >>> sim_ident('cat', 'cat')
2182
    1
2183
    """
2184
    return int(src == tar)
2185
2186
2187
def dist_ident(src, tar):
2188
    """Return the identity distance between two strings.
2189
2190
    Identity distance
2191
2192
    This is 0 if the two strings are identical, otherwise 1, i.e.
2193
    :math:`dist_{identity} = 1 - sim_{identity}`
2194
2195
    :param str src, tar: two strings to be compared
2196
    :returns: identity distance
2197
    :rtype: int
2198
2199
    >>> dist_ident('cat', 'hat')
2200
    1
2201
    >>> dist_ident('cat', 'cat')
2202
    0
2203
    """
2204
    return 1 - sim_ident(src, tar)
2205
2206
2207
def sim_matrix(src, tar, mat=None, mismatch_cost=0, match_cost=1,
0 ignored issues
show
best-practice introduced by
Too many arguments (7/5)
Loading history...
2208
               symmetric=True, alphabet=None):
2209
    """Return the matrix similarity of two strings.
2210
2211
    Matrix similarity
2212
2213
    With the default parameters, this is identical to sim_ident.
2214
    It is possible for sim_matrix to return values outside of the range
2215
    :math:`[0, 1]`, if values outside that range are present in mat,
2216
    mismatch_cost, or match_cost.
2217
2218
    :param str src, tar: two strings to be compared
2219
    :param dict mat: a dict mapping tuples to costs; the tuples are (src, tar)
2220
        pairs of symbols from the alphabet parameter
2221
    :param float mismatch_cost: the value returned if (src, tar) is absent from
2222
        mat when src does not equal tar
2223
    :param float match_cost: the value returned if (src, tar) is absent from
2224
        mat when src equals tar
2225
    :param bool symmetric: True if the cost of src not matching tar is
2226
        identical to the cost of tar not matching src; in this case, the values
2227
        in mat need only contain (src, tar) or (tar, src), not both
2228
    :param str alphabet: a collection of tokens from which src and tar are
2229
        drawn; if this is defined a ValueError is raised if either tar or src
2230
        is not found in alphabet
2231
    :returns: matrix similarity
2232
    :rtype: float
2233
2234
    >>> sim_matrix('cat', 'hat')
2235
    0
2236
    >>> sim_matrix('hat', 'hat')
2237
    1
2238
    """
2239
    if alphabet:
2240
        alphabet = tuple(alphabet)
2241
        for i in src:
2242
            if i not in alphabet:
2243
                raise ValueError('src value not in alphabet')
2244
        for i in tar:
2245
            if i not in alphabet:
2246
                raise ValueError('tar value not in alphabet')
2247
2248
    if src == tar:
2249
        if mat and (src, src) in mat:
2250
            return mat[(src, src)]
2251
        return match_cost
2252
    if mat and (src, tar) in mat:
0 ignored issues
show
unused-code introduced by
Unnecessary "elif" after "return"
Loading history...
2253
        return mat[(src, tar)]
2254
    elif symmetric and mat and (tar, src) in mat:
2255
        return mat[(tar, src)]
2256
    return mismatch_cost
2257
2258
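# A hedged usage sketch (the scoring dict below is illustrative, not a
# shipped matrix): with the default symmetric=True, only one orientation of
# a pair needs to appear in mat:
#
#     >>> sim_matrix('A', 'G', mat={('A', 'G'): 0.5})
#     0.5
#     >>> sim_matrix('G', 'A', mat={('A', 'G'): 0.5})
#     0.5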
2259 View Code Duplication
def needleman_wunsch(src, tar, gap_cost=1, sim_func=sim_ident):
0 ignored issues
show
Duplication introduced by
This code seems to be duplicated in your project.
Loading history...
2260
    """Return the Needleman-Wunsch score of two strings.
2261
2262
    Needleman-Wunsch score
2263
2264
    This computes the standard global sequence alignment score.
2265
2266
    Cf. https://en.wikipedia.org/wiki/Needleman–Wunsch_algorithm
2267
2268
    Cf.
2269
    http://csb.stanford.edu/class/public/readings/Bioinformatics_I_Lecture6/Needleman_Wunsch_JMB_70_Global_alignment.pdf
2270
2271
    :param str src, tar: two strings to be compared
2272
    :param float gap_cost: the cost of an alignment gap (1 by default)
2273
    :param function sim_func: a function that returns the similarity of two
2274
        characters (identity similarity by default)
2275
    :returns: Needleman-Wunsch score
2276
    :rtype: float (in fact dependent on the gap_cost & return value of
        sim_func)
2277
2278
    >>> needleman_wunsch('cat', 'hat')
2279
    2.0
2280
    >>> needleman_wunsch('Niall', 'Neil')
2281
    1.0
2282
    >>> needleman_wunsch('aluminum', 'Catalan')
2283
    -1.0
2284
    >>> needleman_wunsch('ATCG', 'TAGC')
2285
    0.0
2286
    """
2287
    # pylint: disable=no-member
2288
    d_mat = np.zeros((len(src)+1, len(tar)+1), dtype=np.float)
2289
    # pylint: enable=no-member
2290
2291
    for i in range(len(src)+1):
2292
        d_mat[i, 0] = -(i * gap_cost)
2293
    for j in range(len(tar)+1):
2294
        d_mat[0, j] = -(j * gap_cost)
2295
    for i in range(1, len(src)+1):
2296
        for j in range(1, len(tar)+1):
2297
            match = d_mat[i-1, j-1] + sim_func(src[i-1], tar[j-1])
2298
            delete = d_mat[i-1, j] - gap_cost
2299
            insert = d_mat[i, j-1] - gap_cost
2300
            d_mat[i, j] = max(match, delete, insert)
2301
    return d_mat[d_mat.shape[0]-1, d_mat.shape[1]-1]
2302
2303
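# The recurrence filled in above is the classic global-alignment DP
# (annotated as a sketch):
#
#     d_mat[i, j] = max(d_mat[i-1, j-1] + sim_func(src[i-1], tar[j-1]),
#                       d_mat[i-1, j] - gap_cost,    # deletion (gap in tar)
#                       d_mat[i, j-1] - gap_cost)    # insertion (gap in src)
#
# e.g. needleman_wunsch('cat', 'hat') aligns 'a' and 't' (score 1 each) and
# substitutes 'c'/'h' (score 0), giving the 2.0 shown in the doctest above.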
2304 View Code Duplication
def smith_waterman(src, tar, gap_cost=1, sim_func=sim_ident):
0 ignored issues
show
Duplication introduced by
This code seems to be duplicated in your project.
Loading history...
2305
    """Return the Smith-Waterman score of two strings.
2306
2307
    Smith-Waterman score
2308
2309
    This computes the standard local sequence alignment score.
2310
2311
    Cf. https://en.wikipedia.org/wiki/Smith–Waterman_algorithm
2312
2313
    :param str src, tar: two strings to be compared
2314
    :param float gap_cost: the cost of an alignment gap (1 by default)
2315
    :param function sim_func: a function that returns the similarity of two
2316
        characters (identity similarity by default)
2317
    :returns: Smith-Waterman score
2318
    :rtype: float (in fact dependent on the gap_cost & return value of
        sim_func)
2319
2320
    >>> smith_waterman('cat', 'hat')
2321
    2.0
2322
    >>> smith_waterman('Niall', 'Neil')
2323
    1.0
2324
    >>> smith_waterman('aluminum', 'Catalan')
2325
    0.0
2326
    >>> smith_waterman('ATCG', 'TAGC')
2327
    1.0
2328
    """
2329
    # pylint: disable=no-member
2330
    d_mat = np.zeros((len(src)+1, len(tar)+1), dtype=np.float)
2331
    # pylint: enable=no-member
2332
2333
    for i in range(len(src)+1):
2334
        d_mat[i, 0] = 0
2335
    for j in range(len(tar)+1):
2336
        d_mat[0, j] = 0
2337
    for i in range(1, len(src)+1):
2338
        for j in range(1, len(tar)+1):
2339
            match = d_mat[i-1, j-1] + sim_func(src[i-1], tar[j-1])
2340
            delete = d_mat[i-1, j] - gap_cost
2341
            insert = d_mat[i, j-1] - gap_cost
2342
            d_mat[i, j] = max(0, match, delete, insert)
2343
    return d_mat[d_mat.shape[0]-1, d_mat.shape[1]-1]
2344
2345
2346
def gotoh(src, tar, gap_open=1, gap_ext=0.4, sim_func=sim_ident):
2347
    """Return the Gotoh score of two strings.
2348
2349
    Gotoh score
2350
2351
    Gotoh's algorithm is essentially Needleman-Wunsch with affine gap
2352
    penalties:
2353
    https://www.cs.umd.edu/class/spring2003/cmsc838t/papers/gotoh1982.pdf
2354
2355
    :param str src, tar: two strings to be compared
2356
    :param float gap_open: the cost of an open alignment gap (1 by default)
2357
    :param float gap_ext: the cost of an alignment gap extension (0.4 by
2358
        default)
2359
    :param function sim_func: a function that returns the similarity of two
2360
        characters (identity similarity by default)
2361
    :returns: Gotoh score
2362
    :rtype: float (in fact dependent on the gap_cost & return value of
2363
        sim_func)
2364
2365
    >>> gotoh('cat', 'hat')
2366
    2.0
2367
    >>> gotoh('Niall', 'Neil')
2368
    1.0
2369
    >>> gotoh('aluminum', 'Catalan')
2370
    -0.40000000000000002
2371
    >>> gotoh('cat', 'hat')
2372
    2.0
2373
    """
2374
    # pylint: disable=no-member
2375
    d_mat = np.zeros((len(src)+1, len(tar)+1), dtype=np.float)
2376
    p_mat = np.zeros((len(src)+1, len(tar)+1), dtype=np.float)
2377
    q_mat = np.zeros((len(src)+1, len(tar)+1), dtype=np.float)
2378
    # pylint: enable=no-member
2379
2380
    d_mat[0, 0] = 0
2381
    p_mat[0, 0] = float('-inf')
2382
    q_mat[0, 0] = float('-inf')
2383
    for i in range(1, len(src)+1):
2384
        d_mat[i, 0] = float('-inf')
2385
        p_mat[i, 0] = -gap_open - gap_ext*(i-1)
2386
        q_mat[i, 0] = float('-inf')
2387
        q_mat[i, 1] = -gap_open
2388
    for j in range(1, len(tar)+1):
2389
        d_mat[0, j] = float('-inf')
2390
        p_mat[0, j] = float('-inf')
2391
        p_mat[1, j] = -gap_open
2392
        q_mat[0, j] = -gap_open - gap_ext*(j-1)
2393
2394
    for i in range(1, len(src)+1):
2395
        for j in range(1, len(tar)+1):
2396
            sim_val = sim_func(src[i-1], tar[j-1])
2397
            d_mat[i, j] = max(d_mat[i-1, j-1] + sim_val,
2398
                              p_mat[i-1, j-1] + sim_val,
2399
                              q_mat[i-1, j-1] + sim_val)
2400
2401
            p_mat[i, j] = max(d_mat[i-1, j] - gap_open,
2402
                              p_mat[i-1, j] - gap_ext)
2403
2404
            q_mat[i, j] = max(d_mat[i, j-1] - gap_open,
2405
                              q_mat[i, j-1] - gap_ext)
2406
2407
    i, j = (n - 1 for n in d_mat.shape)
0 ignored issues
show
Comprehensibility Best Practice introduced by
The variable n does not seem to be defined.
Loading history...
2408
    return max(d_mat[i, j], p_mat[i, j], q_mat[i, j])
2409
2410
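# Affine gaps in the three-matrix scheme above: a run of k consecutive gaps
# costs gap_open + (k - 1) * gap_ext rather than k * gap_open, which helps
# explain why gotoh('aluminum', 'Catalan') (-0.4) scores higher than
# needleman_wunsch('aluminum', 'Catalan') (-1.0) in the doctests above.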
2411
def sim_length(src, tar):
2412
    """Return the length similarty of two strings.
2413
2414
    Length similarity
2415
2416
    This is the ratio of the length of the shorter string to the longer.
2417
2418
    :param str src, tar: two strings to be compared
2419
    :returns: length similarity
2420
    :rtype: float
2421
2422
    >>> sim_length('cat', 'hat')
2423
    1.0
2424
    >>> sim_length('Niall', 'Neil')
2425
    0.8
2426
    >>> sim_length('aluminum', 'Catalan')
2427
    0.875
2428
    >>> sim_length('ATCG', 'TAGC')
2429
    1.0
2430
    """
2431
    if src == tar:
2432
        return 1.0
2433
    if not src or not tar:
2434
        return 0.0
2435
    return len(src)/len(tar) if len(src) < len(tar) else len(tar)/len(src)
2436
2437
2438
def dist_length(src, tar):
2439
    """Return the length distance between two strings.
2440
2441
    Length distance
2442
2443
    Length distance is the complement of length similarity:
2444
    :math:`dist_{length} = 1 - sim_{length}`
2445
2446
    :param str src, tar: two strings to be compared
2447
    :returns: length distance
2448
    :rtype: float
2449
2450
    >>> dist_length('cat', 'hat')
2451
    0.0
2452
    >>> dist_length('Niall', 'Neil')
2453
    0.19999999999999996
2454
    >>> dist_length('aluminum', 'Catalan')
2455
    0.125
2456
    >>> dist_length('ATCG', 'TAGC')
2457
    0.0
2458
    """
2459
    return 1 - sim_length(src, tar)
2460
2461
2462 View Code Duplication
def sim_prefix(src, tar):
0 ignored issues
show
Duplication introduced by
This code seems to be duplicated in your project.
Loading history...
2463
    """Return the prefix similarty of two strings.
2464
2465
    Prefix similarity
2466
2467
    Prefix similarity is the ratio of the length of the shorter term that
2468
    exactly matches the longer term to the length of the shorter term,
2469
    beginning at the start of both terms.
2470
2471
    :param str src, tar: two strings to be compared
2472
    :returns: prefix similarity
2473
    :rtype: float
2474
2475
    >>> sim_prefix('cat', 'hat')
2476
    0.0
2477
    >>> sim_prefix('Niall', 'Neil')
2478
    0.25
2479
    >>> sim_prefix('aluminum', 'Catalan')
2480
    0.0
2481
    >>> sim_prefix('ATCG', 'TAGC')
2482
    0.0
2483
    """
2484
    if src == tar:
2485
        return 1.0
2486
    if not src or not tar:
2487
        return 0.0
2488
    min_word, max_word = (src, tar) if len(src) < len(tar) else (tar, src)
2489
    min_len = len(min_word)
2490
    for i in range(min_len, 0, -1):
2491
        if min_word[:i] == max_word[:i]:
2492
            return i/min_len
2493
    return 0.0
2494
2495
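# Worked through for the 'Niall'/'Neil' doctest above: the shorter term is
# 'Neil' (length 4) and its longest prefix that matches 'Niall' is 'N'
# (length 1), so the similarity is 1/4 == 0.25.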
2496
def dist_prefix(src, tar):
2497
    """Return the prefix distance between two strings.
2498
2499
    Prefix distance
2500
2501
    Prefix distance is the complement of prefix similarity:
2502
    :math:`dist_{prefix} = 1 - sim_{prefix}`
2503
2504
    :param str src, tar: two strings to be compared
2505
    :returns: prefix distance
2506
    :rtype: float
2507
2508
    >>> dist_prefix('cat', 'hat')
2509
    1.0
2510
    >>> dist_prefix('Niall', 'Neil')
2511
    0.75
2512
    >>> dist_prefix('aluminum', 'Catalan')
2513
    1.0
2514
    >>> dist_prefix('ATCG', 'TAGC')
2515
    1.0
2516
    """
2517
    return 1 - sim_prefix(src, tar)
2518
2519
2520 View Code Duplication
def sim_suffix(src, tar):
0 ignored issues
show
Duplication introduced by
This code seems to be duplicated in your project.
Loading history...
2521
    """Return the suffix similarity of two strings.
2522
2523
    Suffix similarity
2524
2525
    Suffix similarity is the ratio of the length of the shorter term that
2526
    exactly matches the longer term to the length of the shorter term,
2527
    beginning at the end of both terms.
2528
2529
    :param str src, tar: two strings to be compared
2530
    :returns: suffix similarity
2531
    :rtype: float
2532
2533
    >>> sim_suffix('cat', 'hat')
2534
    0.6666666666666666
2535
    >>> sim_suffix('Niall', 'Neil')
2536
    0.25
2537
    >>> sim_suffix('aluminum', 'Catalan')
2538
    0.0
2539
    >>> sim_suffix('ATCG', 'TAGC')
2540
    0.0
2541
    """
2542
    if src == tar:
2543
        return 1.0
2544
    if not src or not tar:
2545
        return 0.0
2546
    min_word, max_word = (src, tar) if len(src) < len(tar) else (tar, src)
2547
    min_len = len(min_word)
2548
    for i in range(min_len, 0, -1):
2549
        if min_word[-i:] == max_word[-i:]:
2550
            return i/min_len
2551
    return 0.0
2552
2553
2554
def dist_suffix(src, tar):
2555
    """Return the suffix distance between two strings.
2556
2557
    Suffix distance
2558
2559
    Suffix distance is the complement of suffix similarity:
2560
    :math:`dist_{suffix} = 1 - sim_{suffix}`
2561
2562
    :param str src, tar: two strings to be compared
2563
    :returns: suffix distance
2564
    :rtype: float
2565
2566
    >>> dist_suffix('cat', 'hat')
2567
    0.33333333333333337
2568
    >>> dist_suffix('Niall', 'Neil')
2569
    0.75
2570
    >>> dist_suffix('aluminum', 'Catalan')
2571
    1.0
2572
    >>> dist_suffix('ATCG', 'TAGC')
2573
    1.0
2574
    """
2575
    return 1 - sim_suffix(src, tar)
2576
2577
2578
def sim_mlipns(src, tar, threshold=0.25, maxmismatches=2):
2579
    """Return the MLIPNS similarity of two strings.
2580
2581
    Modified Language-Independent Product Name Search (MLIPNS)
2582
2583
    The MLIPNS algorithm is described in Shannaq, Boumedyen A. N. and Victor V.
2584
    Alexandrov. 2010. "Using Product Similarity for Adding Business." Global
2585
    Journal of Computer Science and Technology. 10(12). 2-8.
2586
    http://www.sial.iias.spb.su/files/386-386-1-PB.pdf
2587
2588
    This function returns only 1.0 (similar) or 0.0 (not similar).
2589
2590
    LIPNS similarity is identical to normalized Hamming similarity.
2591
2592
    :param str src, tar: two strings to be compared
2593
    :param float threshold: a number [0, 1] indicating the maximum fraction
2594
        of mismatched characters at or below which the strings are considered
2595
        'similar' (0.25 by default)
2596
    :param int maxmismatches: a number indicating the allowable number of
2597
        mismatches to remove before declaring two strings not similar (2 by
2598
        default)
2599
    :returns: MLIPNS similarity
2600
    :rtype: float
2601
2602
    >>> sim_mlipns('cat', 'hat')
2603
    1.0
2604
    >>> sim_mlipns('Niall', 'Neil')
2605
    0.0
2606
    >>> sim_mlipns('aluminum', 'Catalan')
2607
    0.0
2608
    >>> sim_mlipns('ATCG', 'TAGC')
2609
    0.0
2610
    """
2611
    if tar == src:
2612
        return 1.0
2613
    if not src or not tar:
2614
        return 0.0
2615
2616
    mismatches = 0
2617
    ham = hamming(src, tar, difflens=True)
2618
    maxlen = max(len(src), len(tar))
2619
    while src and tar and mismatches <= maxmismatches:
2620
        if maxlen < 1 or (1-(maxlen-ham)/maxlen) <= threshold:
0 ignored issues
show
unused-code introduced by
Unnecessary "else" after "return"
Loading history...
2621
            return 1.0
2622
        else:
2623
            mismatches += 1
2624
            ham -= 1
2625
            maxlen -= 1
2626
2627
    if maxlen < 1:
2628
        return 1.0
2629
    return 0.0
2630
2631
2632
def dist_mlipns(src, tar, threshold=0.25, maxmismatches=2):
2633
    """Return the MLIPNS distance between two strings.
2634
2635
    Modified Language-Independent Product Name Search (MLIPNS)
2636
2637
    MLIPNS distance is the complement of MLIPNS similarity:
2638
    :math:`dist_{MLIPNS} = 1 - sim_{MLIPNS}`
2639
2640
    This function returns only 0.0 (not distant) or 1.0 (distant).
2641
2642
    :param str src, tar: two strings to be compared
2643
    :param float threshold: a number [0, 1] indicating the maximum fraction
2644
        of mismatched characters at or below which the strings are considered
2645
        'similar' (0.25 by default)
2646
    :param int maxmismatches: a number indicating the allowable number of
2647
        mismatches to remove before declaring two strings not similar (2 by
2648
        default)
2649
    :returns: MLIPNS distance
2650
    :rtype: float
2651
2652
    >>> dist_mlipns('cat', 'hat')
2653
    0.0
2654
    >>> dist_mlipns('Niall', 'Neil')
2655
    1.0
2656
    >>> dist_mlipns('aluminum', 'Catalan')
2657
    1.0
2658
    >>> dist_mlipns('ATCG', 'TAGC')
2659
    1.0
2660
    """
2661
    return 1.0 - sim_mlipns(src, tar, threshold, maxmismatches)
2662
2663
2664
def bag(src, tar):
2665
    """Return the bag distance between two strings.
2666
2667
    Bag distance
2668
2669
    Bag distance is proposed in Bartolini, Ilaria, Paolo Ciaccia, and Marco
2670
    Patella. 2002. "String Matching with Metric Trees Using an Approximate
2671
    Distance." Proceedings of the 9th International Symposium on String
2672
    Processing and Information Retrieval, Lisbon, Portugal, September 2002.
2673
    271-283.
2674
    http://www-db.disi.unibo.it/research/papers/SPIRE02.pdf
2675
2676
    It is defined as:
2677
    :math:`max( |multiset(src)-multiset(tar)|, |multiset(tar)-multiset(src)| )`
2678
2679
    :param str src, tar: two strings to be compared
2680
    :returns: bag distance
2681
    :rtype: int
2682
2683
    >>> bag('cat', 'hat')
2684
    1
2685
    >>> bag('Niall', 'Neil')
2686
    2
2687
    >>> bag('aluminum', 'Catalan')
2688
    5
2689
    >>> bag('ATCG', 'TAGC')
2690
    0
2691
    >>> bag('abcdefg', 'hijklm')
2692
    7
2693
    >>> bag('abcdefg', 'hijklmno')
2694
    8
2695
    """
2696
    if tar == src:
0 ignored issues
show
unused-code introduced by
Unnecessary "elif" after "return"
Loading history...
2697
        return 0
2698
    elif not src:
2699
        return len(tar)
2700
    elif not tar:
2701
        return len(src)
2702
2703
    src_bag = Counter(src)
2704
    tar_bag = Counter(tar)
2705
    return max(sum((src_bag-tar_bag).values()),
2706
               sum((tar_bag-src_bag).values()))
2707
2708
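# Worked through for the 'Niall'/'Neil' doctest above:
# Counter('Niall') - Counter('Neil') leaves {'a': 1, 'l': 1} (sum 2), while
# Counter('Neil') - Counter('Niall') leaves {'e': 1} (sum 1), so the bag
# distance is max(2, 1) == 2.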
2709
def dist_bag(src, tar):
2710
    """Return the normalized bag distance between two strings.
2711
2712
    Normalized bag distance
2713
2714
    Bag distance is normalized by dividing by :math:`max( |src|, |tar| )`.
2715
2716
    :param str src, tar: two strings to be compared
2717
    :returns: normalized bag distance
2718
    :rtype: float
2719
2720
    >>> dist_bag('cat', 'hat')
2721
    0.3333333333333333
2722
    >>> dist_bag('Niall', 'Neil')
2723
    0.4
2724
    >>> dist_bag('aluminum', 'Catalan')
2725
    0.375
2726
    >>> dist_bag('ATCG', 'TAGC')
2727
    0.0
2728
    """
2729
    if tar == src:
2730
        return 0.0
2731
    if not src or not tar:
2732
        return 1.0
2733
2734
    maxlen = max(len(src), len(tar))
2735
2736
    return bag(src, tar)/maxlen
2737
2738
2739
def sim_bag(src, tar):
2740
    """Return the normalized bag similarity of two strings.
2741
2742
    Normalized bag similarity
2743
2744
    Normalized bag similarity is the complement of normalized bag distance:
2745
    :math:`sim_{bag} = 1 - dist_{bag}`
2746
2747
    :param str src, tar: two strings to be compared
2748
    :returns: normalized bag similarity
2749
    :rtype: float
2750
2751
    >>> sim_bag('cat', 'hat')
2752
    0.6666666666666667
2753
    >>> sim_bag('Niall', 'Neil')
2754
    0.6
2755
    >>> sim_bag('aluminum', 'Catalan')
2756
    0.625
2757
    >>> sim_bag('ATCG', 'TAGC')
2758
    1.0
2759
    """
2760
    return 1-dist_bag(src, tar)
2761
2762
2763
def editex(src, tar, cost=(0, 1, 2), local=False):
0 ignored issues
show
Comprehensibility introduced by
This function exceeds the maximum number of variables (16/15).
Loading history...
2764
    """Return the Editex distance between two strings.
2765
2766
    Editex distance
2767
2768
    As described on pages 3 & 4 of
2769
    Zobel, Justin and Philip Dart. 1996. Phonetic string matching: Lessons from
2770
    information retrieval. In: Proceedings of the ACM-SIGIR Conference on
2771
    Research and Development in Information Retrieval, Zurich, Switzerland.
2772
    166–173. https://doi.org/10.1145/243199.243258
2773
2774
    The local variant is based on
2775
    Ring, Nicholas and Alexandra L. Uitdenbogerd. 2009. Finding ‘Lucy in
2776
    Disguise’: The Misheard Lyric Matching Problem. In: Proceedings of the 5th
2777
    Asia Information Retrieval Symposium, Sapporo, Japan. 157-167.
2778
    http://www.seg.rmit.edu.au/research/download.php?manuscript=404
2779
2780
    :param str src, tar: two strings to be compared
2781
    :param tuple cost: a 3-tuple representing the cost of the three possible
2782
        edits:
2783
        match, same-group, and mismatch respectively (by default: (0, 1, 2))
2784
    :param bool local: if True, the local variant of Editex is used
2785
    :returns: Editex distance
2786
    :rtype: int
2787
2788
    >>> editex('cat', 'hat')
2789
    2
2790
    >>> editex('Niall', 'Neil')
2791
    2
2792
    >>> editex('aluminum', 'Catalan')
2793
    12
2794
    >>> editex('ATCG', 'TAGC')
2795
    6
2796
    """
2797
    match_cost, group_cost, mismatch_cost = cost
2798
    letter_groups = ({'A', 'E', 'I', 'O', 'U', 'Y'},
2799
                     {'B', 'P'},
2800
                     {'C', 'K', 'Q'},
2801
                     {'D', 'T'},
2802
                     {'L', 'R'},
2803
                     {'M', 'N'},
2804
                     {'G', 'J'},
2805
                     {'F', 'P', 'V'},
2806
                     {'S', 'X', 'Z'},
2807
                     {'C', 'S', 'Z'})
2808
    all_letters = {'A', 'B', 'C', 'D', 'E', 'F', 'G', 'I', 'J', 'K', 'L', 'M',
2809
                   'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'X', 'Y', 'Z'}
2810
2811
    def r_cost(ch1, ch2):
2812
        """Return r(a,b) according to Zobel & Dart's definition."""
2813
        if ch1 == ch2:
2814
            return match_cost
2815
        if ch1 in all_letters and ch2 in all_letters:
2816
            for group in letter_groups:
2817
                if ch1 in group and ch2 in group:
2818
                    return group_cost
2819
        return mismatch_cost
2820
2821
    def d_cost(ch1, ch2):
2822
        """Return d(a,b) according to Zobel & Dart's definition."""
2823
        if ch1 != ch2 and (ch1 == 'H' or ch1 == 'W'):
0 ignored issues
show
Unused Code introduced by
Consider merging these comparisons with "in" to "ch1 in ('H', 'W')"
Loading history...
2824
            return group_cost
2825
        return r_cost(ch1, ch2)
2826
2827
    # convert both src & tar to NFKD normalized unicode
2828
    src = unicodedata.normalize('NFKD', text_type(src.upper()))
2829
    tar = unicodedata.normalize('NFKD', text_type(tar.upper()))
2830
    # convert ß to SS (for Python2)
2831
    src = src.replace('ß', 'SS')
2832
    tar = tar.replace('ß', 'SS')
2833
2834
    if src == tar:
2835
        return 0
2836
    if not src:
2837
        return len(tar) * mismatch_cost
2838
    if not tar:
2839
        return len(src) * mismatch_cost
2840
2841
    # pylint: disable=no-member
2842
    d_mat = np.zeros((len(src)+1, len(tar)+1), dtype=np.int)
2843
    # pylint: enable=no-member
2844
    lens = len(src)
2845
    lent = len(tar)
2846
    src = ' '+src
2847
    tar = ' '+tar
2848
2849
    if not local:
2850
        for i in range(1, lens+1):
2851
            d_mat[i, 0] = d_mat[i-1, 0] + d_cost(src[i-1], src[i])
2852
    for j in range(1, lent+1):
2853
        d_mat[0, j] = d_mat[0, j-1] + d_cost(tar[j-1], tar[j])
2854
2855
    for i in range(1, lens+1):
2856
        for j in range(1, lent+1):
2857
            d_mat[i, j] = min(d_mat[i-1, j] + d_cost(src[i-1], src[i]),
2858
                              d_mat[i, j-1] + d_cost(tar[j-1], tar[j]),
2859
                              d_mat[i-1, j-1] + r_cost(src[i], tar[j]))
2860
2861
    return d_mat[lens, lent]
2862
2863
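# Worked through for the 'cat'/'hat' doctest above: 'C' and 'H' share no
# letter group, so that substitution costs the full mismatch_cost of 2,
# while 'A' and 'T' align with themselves at match_cost 0 -- hence the
# Editex distance of 2.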
2864
def dist_editex(src, tar, cost=(0, 1, 2), local=False):
2865
    """Return the normalized Editex distance between two strings.
2866
2867
    Editex distance normalized to the interval [0, 1]
2868
2869
    The Editex distance is normalized by dividing the Editex distance
2870
    (calculated by either the standard or the local variant) by the greater of
2871
    the number of characters in src times the cost of a delete and
2872
    the number of characters in tar times the cost of an insert.
2873
    For the case in which all operations have :math:`cost = 1`, this is
2874
    equivalent to the greater of the lengths of the two strings src & tar.
2875
2876
    :param str src, tar: two strings to be compared
2877
    :param tuple cost: a 3-tuple representing the cost of the three possible
2878
        edits:
2879
        match, same-group, and mismatch respectively (by default: (0, 1, 2))
2880
    :param bool local: if True, the local variant of Editex is used
2881
    :returns: normalized Editex distance
2882
    :rtype: float
2883
2884
    >>> dist_editex('cat', 'hat')
2885
    0.33333333333333331
2886
    >>> dist_editex('Niall', 'Neil')
2887
    0.20000000000000001
2888
    >>> dist_editex('aluminum', 'Catalan')
2889
    0.75
2890
    >>> dist_editex('ATCG', 'TAGC')
2891
    0.75
2892
    """
2893
    if src == tar:
2894
        return 0
2895
    mismatch_cost = cost[2]
2896
    return (editex(src, tar, cost, local) /
2897
            (max(len(src)*mismatch_cost, len(tar)*mismatch_cost)))
2898
2899
2900
def sim_editex(src, tar, cost=(0, 1, 2), local=False):
2901
    """Return the normalized Editex similarity of two strings.
2902
2903
    Editex similarity normalized to the interval [0, 1]
2904
2905
    The Editex similarity is the complement of Editex distance
2906
    :math:`sim_{Editex} = 1 - dist_{Editex}`
2907
2908
    The arguments are identical to those of the editex() function.
2909
2910
    :param str src, tar: two strings to be compared
2911
    :param tuple cost: a 3-tuple representing the cost of the three possible
2912
        edits:
2913
        match, same-group, and mismatch respectively (by default: (0, 1, 2))
2914
    :param bool local: if True, the local variant of Editex is used
2915
    :returns: normalized Editex similarity
2916
    :rtype: float
2917
2918
    >>> sim_editex('cat', 'hat')
2919
    0.66666666666666674
2920
    >>> sim_editex('Niall', 'Neil')
2921
    0.80000000000000004
2922
    >>> sim_editex('aluminum', 'Catalan')
2923
    0.25
2924
    >>> sim_editex('ATCG', 'TAGC')
2925
    0.25
2926
    """
2927
    return 1 - dist_editex(src, tar, cost, local)
2928
2929
2930
def eudex_hamming(src, tar, weights='exponential', maxlength=8,
2931
                  normalized=False):
2932
    """Calculate the Hamming distance between the Eudex hashes of two terms.
2933
2934
    - If weights is set to None, a simple Hamming distance is calculated.
2935
    - If weights is set to 'exponential', weight decays by powers of 2, as
2936
      proposed in the eudex specification: https://github.com/ticki/eudex.
2937
    - If weights is set to 'fibonacci', weight decays through the Fibonacci
2938
      series, as in the eudex reference implementation.
2939
    - If weights is set to a callable function, this assumes it creates a
2940
      generator and the generator is used to populate a series of weights.
2941
    - If weights is set to an iterable, the iterable's values should be
2942
      integers and will be used as the weights.
2943
2944
    :param str src, tar: two strings to be compared
2945
    :param iterable or generator function weights: the weights (as described
        above) to apply to the byte-wise Hamming distances
2946
    :param maxlength: the number of characters to encode as a eudex hash
2947
    :return: the weighted Hamming distance between the Eudex hashes of src &
        tar (normalized to [0, 1] if normalized is True)
    """
    def _gen_fibonacci():
        """Yield the next Fibonacci number.

        Based on https://www.python-course.eu/generators.php
        Starts with the second 1 in the series (i.e. yields 1, 2, 3, 5, ...)
        """
        num_a, num_b = 1, 2
        while True:
            yield num_a
            num_a, num_b = num_b, num_a + num_b

    def _gen_exponential(base=2):
        """Yield the next value in an exponential series of the base.

        Based on https://www.python-course.eu/generators.php
        Starts at base**0
        """
        exp = 0
        while True:
            yield base ** exp
            exp += 1

    # Calculate the eudex hashes and XOR them
    xored = eudex(src, maxlength=maxlength) ^ eudex(tar, maxlength=maxlength)

    # Simple Hamming distance (all bits weighted equally)
    if not weights:
        return bin(xored).count('1')

    # If weights is a function, it should create a generator,
    # which we now use to populate a list
    if callable(weights):
        weights = weights()
    elif weights == 'exponential':
        weights = _gen_exponential()
    elif weights == 'fibonacci':
        weights = _gen_fibonacci()
    if isinstance(weights, types.GeneratorType):
        weights = [next(weights) for _ in range(maxlength)][::-1]

    # Sum the weighted Hamming distance
    # (dist_ is named to avoid shadowing the module-level dist() function)
    dist_ = 0
    maxdist = 0
    while (xored or normalized) and weights:
        maxdist += 8*weights[-1]
        dist_ += bin(xored & 0xFF).count('1') * weights.pop()
        xored >>= 8

    if normalized:
        dist_ /= maxdist

    return dist_


def dist_eudex(src, tar, weights='exponential', maxlength=8):
    """Return normalized Hamming distance between Eudex hashes of two terms.

    - If weights is set to None, a simple Hamming distance is calculated.
    - If weights is set to 'exponential', weight decays by powers of 2, as
      proposed in the eudex specification: https://github.com/ticki/eudex.
    - If weights is set to 'fibonacci', weight decays through the Fibonacci
      series, as in the eudex reference implementation.
    - If weights is set to a callable function, this assumes it creates a
      generator and the generator is used to populate a series of weights.
    - If weights is set to an iterable, the iterable's values should be
      integers and will be used as the weights.

    :param str src, tar: two strings to be compared
    :param iterable or generator function weights: the per-byte weights, as
        described in the list above
    :param int maxlength: the number of characters to encode as a eudex hash
    :return: the normalized Hamming distance, in the interval [0, 1]
    :rtype: float
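
    Boundary examples (values for differing strings depend on the eudex
    hash internals):

    >>> dist_eudex('Niall', 'Niall')
    0.0
    >>> 0 <= dist_eudex('Niall', 'Neil') <= 1
    True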
    """
    return eudex_hamming(src, tar, weights, maxlength, True)


def sim_eudex(src, tar, weights='exponential', maxlength=8):
    """Return normalized Hamming similarity between Eudex hashes of two terms.

    - If weights is set to None, a simple Hamming distance is calculated.
    - If weights is set to 'exponential', weight decays by powers of 2, as
      proposed in the eudex specification: https://github.com/ticki/eudex.
    - If weights is set to 'fibonacci', weight decays through the Fibonacci
      series, as in the eudex reference implementation.
    - If weights is set to a callable function, this assumes it creates a
      generator and the generator is used to populate a series of weights.
    - If weights is set to an iterable, the iterable's values should be
      integers and will be used as the weights.

    :param str src, tar: two strings to be compared
    :param iterable or generator function weights: the per-byte weights, as
        described in the list above
    :param int maxlength: the number of characters to encode as a eudex hash
    :return: the normalized Hamming similarity, in the interval [0, 1]
    :rtype: float
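
    Boundary examples (values for differing strings depend on the eudex
    hash internals):

    >>> sim_eudex('Niall', 'Niall')
    1.0
    >>> 0 <= sim_eudex('Niall', 'Neil') <= 1
    True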
    """
    return 1 - dist_eudex(src, tar, weights, maxlength)


def sift4_simplest(src, tar, max_offset=0):
    """Return the "simplest" Sift4 distance between two terms.

    This is an approximation of edit distance, described in:
    Zackwehdex, Siderite. 2014. "Super Fast and Accurate string distance
    algorithm: Sift4."
    https://siderite.blogspot.com/2014/11/super-fast-and-accurate-string-distance.html

    :param str src, tar: two strings to be compared
    :param int max_offset: the number of characters to search for matching
        letters (with the default of 0, no offset search is performed)
    :return: the Sift4 distance
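
    Boundary examples (values for differing strings depend on the
    max_offset setting):

    >>> sift4_simplest('cat', 'cat') == 0
    True
    >>> sift4_simplest('', 'hat')
    3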
    """
    if not src:
        return len(tar)

    if not tar:
        return len(src)

    src_len = len(src)
    tar_len = len(tar)

    src_cur = 0
    tar_cur = 0
    lcss = 0
    local_cs = 0

    while (src_cur < src_len) and (tar_cur < tar_len):
        if src[src_cur] == tar[tar_cur]:
            local_cs += 1
        else:
            lcss += local_cs
            local_cs = 0
            if src_cur != tar_cur:
                src_cur = tar_cur = max(src_cur, tar_cur)
            for i in range(max_offset):
                if not ((src_cur+i < src_len) or (tar_cur+i < tar_len)):
                    break
                if (src_cur+i < src_len) and (src[src_cur+i] == tar[tar_cur]):
                    src_cur += i
                    local_cs += 1
                    break
                if (tar_cur+i < tar_len) and (src[src_cur] == tar[tar_cur+i]):
                    tar_cur += i
                    local_cs += 1
                    break

        src_cur += 1
        tar_cur += 1

    lcss += local_cs
    return round(max(src_len, tar_len) - lcss)


def sift4_common(src, tar, max_offset=0, max_distance=0):
    """Return the "common" Sift4 distance between two terms.

    This is an approximation of edit distance, described in:
    Zackwehdex, Siderite. 2014. "Super Fast and Accurate string distance
    algorithm: Sift4."
    https://siderite.blogspot.com/2014/11/super-fast-and-accurate-string-distance.html

    :param str src, tar: two strings to be compared
    :param int max_offset: the number of characters to search for matching
        letters (with the default of 0, no offset search is performed)
    :param int max_distance: the distance at which to stop and exit early
        (0 disables the early-exit check)
    :return: the Sift4 distance
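
    Boundary examples (values for longer strings depend on the max_offset
    & max_distance settings):

    >>> sift4_common('cat', 'cat') == 0
    True
    >>> sift4_common('', 'hat')
    3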
    """
    if not src:
        return len(tar)

    if not tar:
        return len(src)

    src_len = len(src)
    tar_len = len(tar)

    src_cur = 0
    tar_cur = 0
    lcss = 0
    local_cs = 0
    trans = 0
    offset_arr = []

    while (src_cur < src_len) and (tar_cur < tar_len):
        if src[src_cur] == tar[tar_cur]:
            local_cs += 1
            is_trans = False
            i = 0
            while i < len(offset_arr):
                ofs = offset_arr[i]
                if src_cur <= ofs['src_cur'] or tar_cur <= ofs['tar_cur']:
                    is_trans = (abs(tar_cur-src_cur) >=
                                abs(ofs['tar_cur']-ofs['src_cur']))
                    if is_trans:
                        trans += 1
                    elif not ofs['trans']:
                        ofs['trans'] = True
                        trans += 1
                    break
                elif src_cur > ofs['tar_cur'] and tar_cur > ofs['src_cur']:
                    del offset_arr[i]
                else:
                    i += 1

            offset_arr.append({'src_cur': src_cur, 'tar_cur': tar_cur,
                               'trans': is_trans})
        else:
            lcss += local_cs
            local_cs = 0
            if src_cur != tar_cur:
                src_cur = tar_cur = min(src_cur, tar_cur)
            for i in range(max_offset):
                if not ((src_cur+i < src_len) or (tar_cur+i < tar_len)):
                    break
                if (src_cur+i < src_len) and (src[src_cur+i] == tar[tar_cur]):
                    src_cur += i-1
                    tar_cur -= 1
                    break
                if (tar_cur+i < tar_len) and (src[src_cur] == tar[tar_cur+i]):
                    src_cur -= 1
                    tar_cur += i-1
                    break

        src_cur += 1
        tar_cur += 1

        if max_distance:
            temporary_distance = max(src_cur, tar_cur) - lcss + trans
            if temporary_distance >= max_distance:
                return round(temporary_distance)

        if (src_cur >= src_len) or (tar_cur >= tar_len):
            lcss += local_cs
            local_cs = 0
            src_cur = tar_cur = min(src_cur, tar_cur)

    lcss += local_cs
    return round(max(src_len, tar_len) - lcss + trans)


def dist_sift4(src, tar, max_offset=0, max_distance=0):
    """Return the normalized "common" Sift4 distance between two terms.

    This is an approximation of edit distance, described in:
    Zackwehdex, Siderite. 2014. "Super Fast and Accurate string distance
    algorithm: Sift4."
    https://siderite.blogspot.com/2014/11/super-fast-and-accurate-string-distance.html

    :param str src, tar: two strings to be compared
    :param max_offset: the number of characters to search for matching letters
    :param max_distance: the distance at which to stop and exit early
    :return: the normalized Sift4 distance
    :rtype: float
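
    A boundary example and a range check (the exact value for distinct
    strings depends on the settings above):

    >>> dist_sift4('cat', 'cat')
    0.0
    >>> 0 <= dist_sift4('cat', 'hat') <= 1
    True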
    """
    return (sift4_common(src, tar, max_offset, max_distance) /
            (max(len(src), len(tar), 1)))  # the 1 guards against 0/0


def sim_sift4(src, tar, max_offset=0, max_distance=0):
    """Return the normalized "common" Sift4 similarity of two terms.

    This is an approximation of edit distance, described in:
    Zackwehdex, Siderite. 2014. "Super Fast and Accurate string distance
    algorithm: Sift4."
    https://siderite.blogspot.com/2014/11/super-fast-and-accurate-string-distance.html

    :param str src, tar: two strings to be compared
    :param max_offset: the number of characters to search for matching letters
    :param max_distance: the distance at which to stop and exit early
    :return: the normalized Sift4 similarity
    :rtype: float
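
    A boundary example and a range check (the exact value for distinct
    strings depends on the settings above):

    >>> sim_sift4('cat', 'cat')
    1.0
    >>> 0 <= sim_sift4('cat', 'hat') <= 1
    True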
    """
    return 1 - dist_sift4(src, tar, max_offset, max_distance)


def sim_baystat(src, tar, min_ss_len=None, left_ext=None, right_ext=None):
    """Return the Baystat similarity.

    Good results for shorter words are reported when setting min_ss_len to 1
    and either left_ext OR right_ext to 1.

    The Baystat similarity is defined in:
    Fürnrohr, Michael, Birgit Rimmelspacher, and Tilman von Roncador. 2002.
    "Zusammenführung von Datenbeständen ohne numerische Identifikatoren: ein
    Verfahren im Rahmen der Testuntersuchungen zu einem registergestützten
    Zensus." [Merging data sets without numeric identifiers: a method within
    the test studies for a register-based census.] Bayern in Zahlen, 2002(7).
    308--321.
    https://www.statistik.bayern.de/medien/statistik/zensus/zusammenf__hrung_von_datenbest__nden_ohne_numerische_identifikatoren.pdf

    This is ostensibly a port of the R module PPRL's implementation:
    https://github.com/cran/PPRL/blob/master/src/MTB_Baystat.cpp
    As such, it could be made more Pythonic.

    :param str src, tar: two strings to be compared
    :param int min_ss_len: minimum substring length to be considered
    :param int left_ext: left-side extension length
    :param int right_ext: right-side extension length
    :rtype: float
    :return: the Baystat similarity
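
    Boundary examples (other values depend on the substring-length &
    extension settings):

    >>> sim_baystat('Niall', 'Niall')
    1
    >>> sim_baystat('Niall', '')
    0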
    """
    if src == tar:
        return 1
    if not src or not tar:
        return 0

    max_len = max(len(src), len(tar))

    if not (min_ss_len and left_ext and right_ext):
        # These can be set via arguments to the function. Otherwise they are
        # set automatically based on values from the article.
        if max_len >= 7:
            min_ss_len = 2
            left_ext = 2
            right_ext = 2
        else:
            # The paper suggests that for short names, (exclusively) one or
            # the other of left_ext and right_ext can be 1, with good results.
            # I use 0 & 0 as the default in this case.
            min_ss_len = 1
            left_ext = 0
            right_ext = 0

    pos = 0
    match_len = 0

    while True:
        if pos + min_ss_len > len(src):
            return match_len/max_len

        hit_len = 0
        idx = 1

        substring = src[pos:pos + min_ss_len]
        search_begin = pos - left_ext

        if search_begin < 0:
            search_begin = 0
            left_ext_len = pos
        else:
            left_ext_len = left_ext

        if pos + min_ss_len + right_ext >= len(tar):
            right_ext_len = len(tar) - pos - min_ss_len
        else:
            right_ext_len = right_ext

        if (search_begin + left_ext_len + min_ss_len + right_ext_len >
                search_begin):
            search_val = tar[search_begin:(search_begin + left_ext_len +
                                           min_ss_len + right_ext_len)]
        else:
            search_val = ''

        flagged_tar = ''
        while substring in search_val and pos + idx <= len(src):
            hit_len = len(substring)
            flagged_tar = tar.replace(substring, '#'*hit_len)

            if pos + min_ss_len + idx <= len(src):
                substring = src[pos:pos + min_ss_len + idx]

            if pos + min_ss_len + right_ext_len + 1 <= len(tar):
                right_ext_len += 1

            if (search_begin + left_ext_len + min_ss_len + right_ext_len <=
                    len(tar)):
                search_val = tar[search_begin:(search_begin + left_ext_len +
                                               min_ss_len + right_ext_len)]

            idx += 1

        if hit_len > 0:
            tar = flagged_tar

        match_len += hit_len
        pos += idx


def dist_baystat(src, tar, min_ss_len=None, left_ext=None, right_ext=None):
    """Return the Baystat distance.

    :param str src, tar: two strings to be compared
    :param int min_ss_len: minimum substring length to be considered
    :param int left_ext: left-side extension length
    :param int right_ext: right-side extension length
    :rtype: float
    :return: the Baystat distance
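
    Boundary examples:

    >>> dist_baystat('Niall', 'Niall')
    0
    >>> dist_baystat('Niall', '')
    1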
    """
    return 1 - sim_baystat(src, tar, min_ss_len, left_ext, right_ext)


def typo(src, tar, metric='euclidean', cost=(1, 1, 0.5, 0.5)):
    """Return the typo distance between two strings.

    This is inspired by Wayne Song's Typo-Distance
    (https://github.com/wsong/Typo-Distance), and a fair bit of this was
    copied from his module. Compared to the original, this supports
    different metrics for substitution.

    :param str src, tar: two strings to be compared
    :param str metric: supported values include: 'euclidean', 'manhattan',
        'log-euclidean', and 'log-manhattan'
    :param tuple cost: a 4-tuple representing the cost of the four possible
        edits: insertion, deletion, substitution, and shift, respectively
        (by default: (1, 1, 0.5, 0.5)). The substitution & shift costs should
        be significantly less than the cost of an insertion & deletion unless
        a log metric is used.
    :return: the typo distance
    :rtype: float
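
    Boundary examples (values for differing strings vary with the chosen
    metric and costs):

    >>> typo('typo', 'typo')
    0.0
    >>> typo('', 'typo')
    4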
    """
    ins_cost, del_cost, sub_cost, shift_cost = cost

    if src == tar:
        return 0.0
    if not src:
        return len(tar) * ins_cost
    if not tar:
        return len(src) * del_cost

    layout = {'QWERTY': (
        (('`', '1', '2', '3', '4', '5', '6', '7', '8', '9', '0', '-', '='),
         ('', 'q', 'w', 'e', 'r', 't', 'y', 'u', 'i', 'o', 'p', '[', ']',
          '\\'),
         ('', 'a', 's', 'd', 'f', 'g', 'h', 'j', 'k', 'l', ';', '\''),
         ('', 'z', 'x', 'c', 'v', 'b', 'n', 'm', ',', '.', '/'),
         ('', '', ' ', ' ', ' ', ' ', ' ', ' ', ' ')),
        (('~', '!', '@', '#', '$', '%', '^', '&', '*', '(', ')', '_', '+'),
         ('', 'Q', 'W', 'E', 'R', 'T', 'Y', 'U', 'I', 'O', 'P', '{', '}', '|'),
         ('', 'A', 'S', 'D', 'F', 'G', 'H', 'J', 'K', 'L', ':', '"'),
         ('', 'Z', 'X', 'C', 'V', 'B', 'N', 'M', '<', '>', '?'),
         ('', '', ' ', ' ', ' ', ' ', ' ', ' ', ' '))
    )}

    keyboard = layout['QWERTY']
    lowercase = {item for sublist in keyboard[0] for item in sublist}
    uppercase = {item for sublist in keyboard[1] for item in sublist}

    def _kb_array_for_char(char):
        """Return the keyboard (sub-)array that contains char."""
        if char in lowercase:
            return keyboard[0]
        if char in uppercase:
            return keyboard[1]
        raise ValueError(char + ' not found in any keyboard layouts')

    def _get_char_coord(char, kb_array):
        """Return the row & column of char in the given keyboard array."""
        for row in kb_array:
            if char in row:
                return kb_array.index(row), row.index(char)
        raise ValueError(char + ' not found in given keyboard layout')

    def _euclidean_keyboard_distance(char1, char2):
        row1, col1 = _get_char_coord(char1, _kb_array_for_char(char1))
        row2, col2 = _get_char_coord(char2, _kb_array_for_char(char2))
        return ((row1 - row2) ** 2 + (col1 - col2) ** 2) ** 0.5

    def _manhattan_keyboard_distance(char1, char2):
        row1, col1 = _get_char_coord(char1, _kb_array_for_char(char1))
        row2, col2 = _get_char_coord(char2, _kb_array_for_char(char2))
        return abs(row1 - row2) + abs(col1 - col2)

    def _log_euclidean_keyboard_distance(char1, char2):
        return math.log(1 + _euclidean_keyboard_distance(char1, char2))

    def _log_manhattan_keyboard_distance(char1, char2):
        return math.log(1 + _manhattan_keyboard_distance(char1, char2))

    metric_dict = {'euclidean': _euclidean_keyboard_distance,
                   'manhattan': _manhattan_keyboard_distance,
                   'log-euclidean': _log_euclidean_keyboard_distance,
                   'log-manhattan': _log_manhattan_keyboard_distance}

    def _substitution_cost(char1, char2):
        # Substitution cost scales with key distance; crossing between the
        # shifted & unshifted keyboard arrays adds the shift cost.
        return sub_cost * (metric_dict[metric](char1, char2) +
                           shift_cost * (_kb_array_for_char(char1) !=
                                         _kb_array_for_char(char2)))

    d_mat = np.zeros((len(src) + 1, len(tar) + 1), dtype=np.float32)
    for i in range(len(src) + 1):
        d_mat[i, 0] = i * del_cost
    for j in range(len(tar) + 1):
        d_mat[0, j] = j * ins_cost

    for i, src_char in enumerate(src):
        for j, tar_char in enumerate(tar):
            d_mat[i + 1, j + 1] = min(
                d_mat[i + 1, j] + ins_cost,  # ins
                d_mat[i, j + 1] + del_cost,  # del
                d_mat[i, j] + (_substitution_cost(src_char, tar_char)
                               if src_char != tar_char else 0)  # sub/==
            )

    return d_mat[len(src), len(tar)]


def dist_typo(src, tar, metric='euclidean', cost=(1, 1, 0.5, 0.5)):
    """Return the normalized typo distance between two strings.

    This is inspired by Wayne Song's Typo-Distance
    (https://github.com/wsong/Typo-Distance), and a fair bit of this was
    copied from his module. Compared to the original, this supports
    different metrics for substitution.

    :param str src, tar: two strings to be compared
    :param str metric: supported values include: 'euclidean', 'manhattan',
        'log-euclidean', and 'log-manhattan'
    :param tuple cost: a 4-tuple representing the cost of the four possible
        edits: insertion, deletion, substitution, and shift, respectively
        (by default: (1, 1, 0.5, 0.5)). The substitution & shift costs should
        be significantly less than the cost of an insertion & deletion unless
        a log metric is used.
    :return: the normalized typo distance
    :rtype: float
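
    A boundary example and a range check (the exact value for distinct
    strings depends on the metric and costs):

    >>> dist_typo('typo', 'typo')
    0
    >>> 0 <= dist_typo('typo', 'tylo') <= 1
    True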
    """
    if src == tar:
        return 0
    ins_cost, del_cost = cost[:2]
    return (typo(src, tar, metric, cost) /
            (max(len(src)*del_cost, len(tar)*ins_cost)))


def sim_typo(src, tar, metric='euclidean', cost=(1, 1, 0.5, 0.5)):
    """Return the normalized typo similarity between two strings.

    This is inspired by Wayne Song's Typo-Distance
    (https://github.com/wsong/Typo-Distance), and a fair bit of this was
    copied from his module. Compared to the original, this supports
    different metrics for substitution.

    :param str src, tar: two strings to be compared
    :param str metric: supported values include: 'euclidean', 'manhattan',
        'log-euclidean', and 'log-manhattan'
    :param tuple cost: a 4-tuple representing the cost of the four possible
        edits: insertion, deletion, substitution, and shift, respectively
        (by default: (1, 1, 0.5, 0.5)). The substitution & shift costs should
        be significantly less than the cost of an insertion & deletion unless
        a log metric is used.
    :return: the normalized typo similarity
    :rtype: float
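
    A boundary example and a range check (the exact value for distinct
    strings depends on the metric and costs):

    >>> sim_typo('typo', 'typo')
    1
    >>> 0 <= sim_typo('typo', 'tylo') <= 1
    True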
    """
    return 1 - dist_typo(src, tar, metric, cost)


def dist_indel(src, tar):
    """Return the indel distance between two strings.

    This is equivalent to Levenshtein distance when only insertions and
    deletions are possible.

    :param str src, tar: two strings to be compared
    :return: indel distance
    :rtype: float
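
    Boundary & range sanity checks:

    >>> dist_indel('cat', 'cat') == 0
    True
    >>> 0 <= dist_indel('cat', 'hat') <= 1
    True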
    """
    # substitutions & transpositions are blocked by the prohibitive cost
    return dist_levenshtein(src, tar, mode='lev', cost=(1, 1, 9999, 9999))


def sim_indel(src, tar):
    """Return the indel similarity of two strings.

    :param str src, tar: two strings to be compared
    :return: indel similarity
    :rtype: float
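
    Boundary & range sanity checks:

    >>> sim_indel('cat', 'cat') == 1
    True
    >>> 0 <= sim_indel('cat', 'hat') <= 1
    True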
    """
    # substitutions & transpositions are blocked by the prohibitive cost
    return sim_levenshtein(src, tar, mode='lev', cost=(1, 1, 9999, 9999))


def synoname(src, tar, word_approx_min=0.3, char_approx_min=0.73,
             tests=2**11-1):
    """Return the Synoname similarity type of two words.

    NB: this implementation is incomplete; only the 'exact' test is
    performed so far, and word_approx_min is not yet used.

    :param src, tar: each may be either a 3-tuple of (last name, first name,
        qualifier) or a '#'-delimited string containing those three fields
    :param float word_approx_min: the minimum word approximation value to
        signal a 'word_approx' match
    :param float char_approx_min: the minimum character approximation value
        to signal a 'char_approx' match
    :param int or iterable tests: either an integer bitfield of tests to
        perform or an iterable of test names to perform (by default, all
        tests are performed)
    :return: a Synoname match type
    """
    punct = {'=', "'", '-', '|', '.', ' '}  # for the pending punctuation test
    test_dict = {val: 2**n for n, val in enumerate([
        'exact', 'omission', 'substitution', 'transposition', 'punctuation',
        'initials', 'extended', 'inclusion', 'no_first', 'word_approx',
        'confusions', 'char_approx'])}
    match_type_dict = {val: n for n, val in enumerate([
        'exact', 'omission', 'substitution', 'transposition', 'punctuation',
        'initials', 'extended', 'inclusion', 'no_first', 'word_approx',
        'confusions', 'char_approx'], 1)}

    if isinstance(tests, Iterable):
        new_tests = 0
        for term in tests:
            if term in test_dict:
                new_tests += test_dict[term]
        tests = new_tests

    if isinstance(src, tuple):
        src_ln, src_fn, src_qual = src
    else:
        src_ln, src_fn, src_qual = src.split('#')[1:4]
    if isinstance(tar, tuple):
        tar_ln, tar_fn, tar_qual = tar
    else:
        tar_ln, tar_fn, tar_qual = tar.split('#')[1:4]

    # 1. Preprocessing

    # Lowercasing
    src_fn = src_fn.strip().lower()
    src_ln = src_ln.strip().lower()
    src_qual = src_qual.strip().lower()

    tar_fn = tar_fn.strip().lower()
    tar_ln = tar_ln.strip().lower()
    tar_qual = tar_qual.strip().lower()

    # Create toolcodes
    src_fn, src_ln, src_tc = synoname_toolcode(src_fn, src_ln, src_qual)
    tar_fn, tar_ln, tar_tc = synoname_toolcode(tar_fn, tar_ln, tar_qual)

    # Parse the toolcodes into their components. Several of these are not
    # yet used, pending implementation of the remaining tests.
    src_qualcode = int(src_tc[0])
    src_punctcode = int(src_tc[1])
    src_generation = int(src_tc[2])
    src_romancode = int(src_tc[3:6])
    src_len_first = int(src_tc[6:8])
    src_len_last = int(src_tc[8:10])
    src_tc = src_tc.split('#')
    src_specials = src_tc[1]
    src_search_range = src_tc[2]
    src_len_specials = len(src_specials)

    tar_qualcode = int(tar_tc[0])
    tar_punctcode = int(tar_tc[1])
    tar_generation = int(tar_tc[2])
    tar_romancode = int(tar_tc[3:6])
    tar_len_first = int(tar_tc[6:8])
    tar_len_last = int(tar_tc[8:10])
    tar_tc = tar_tc.split('#')
    tar_specials = tar_tc[1]
    tar_search_range = tar_tc[2]
    tar_len_specials = len(tar_specials)

    # Conflict flags (qual_conflict is not yet consulted by any test)
    qual_conflict = src_qual != tar_qual
    gen_conflict = (src_generation != tar_generation and
                    (src_generation or tar_generation))
    roman_conflict = (src_romancode != tar_romancode and
                      (src_romancode or tar_romancode))

    # approx_c
    def approx_c():
        if gen_conflict or roman_conflict:
            return False, 0.0
        full_name = ' '.join((tar_ln, tar_fn))
        if full_name.startswith('master '):
            full_name = full_name[len('master '):]
            for intro in ['of the ', 'of ', 'known as the ', 'with the ',
                          'with ']:
                if full_name.startswith(intro):
                    full_name = full_name[len(intro):]

        # FIXME: the character-approximation ratio is not yet computed;
        # 0.0 is a placeholder until then:
        # ca_ratio = simil(cap_full_name, full_name)
        ca_ratio = 0.0
        return ca_ratio >= char_approx_min, ca_ratio

    def simil(src, tar):
        # retained for the approximation tests that remain to be implemented
        return 100*sim_ratcliff_obershelp(src, tar)

    approx_c_result, ca_ratio = approx_c()

    # Of the Synoname tests, only 'exact' is performed so far.
    if ca_ratio >= char_approx_min and ca_ratio >= 70:
        if tests & test_dict['exact'] and src == tar:
            return match_type_dict['exact']


def sim_tfidf(src, tar, qval=2, docs_src=None, docs_tar=None):
    """Return the TF-IDF similarity of two strings.

    TF-IDF similarity

    This is chiefly based on the "Formal Definition of TF/IDF Distance" at:
    http://alias-i.com/lingpipe/docs/api/com/aliasi/spell/TfIdfDistance.html

    NB: the general computation is not yet implemented; only the trivial
    boundary cases return meaningful values.

    :param str src, tar: two strings to be compared (or QGrams/Counter
        objects)
    :param int qval: the length of each q-gram; 0 or None for non-q-gram
        version
    :param Counter docs_src: a Counter object or string representing the
        document corpus for the src string
    :param Counter docs_tar: a Counter object or string representing the
        document corpus for the tar string (or set to None to use the
        docs_src for both); currently unused
    :returns: TF-IDF similarity
    :rtype: float
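
    Only the trivial boundary cases are exercised here, since the general
    computation is not yet finished:

    >>> sim_tfidf('cat', 'cat')
    1.0
    >>> sim_tfidf('', 'cat')
    0.0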
    """
    if src == tar:
        return 1.0  # TODO: confirm correctness of this when docs are different
    if not src or not tar:
        return 0.0

    q_src, q_tar = _get_qgrams(src, tar, qval)

    if isinstance(docs_src, Counter):
        q_docs = docs_src
    elif qval and qval > 0:
        q_docs = QGrams(docs_src, qval)
    else:
        q_docs = Counter(docs_src.strip().split())

    if not q_src or not q_tar:
        return 0.0

    # TODO: finish implementation (q_docs is unused until then)
    return 0.5  # hardcoded to half

###############################################################################


def sim(src, tar, method=sim_levenshtein):
    """Return a similarity of two strings.

    This is a generalized function for calling other similarity functions.

    :param str src, tar: two strings to be compared
    :param function method: specifies the similarity metric (Levenshtein by
        default)
    :returns: similarity according to the specified function
    :rtype: float

    >>> sim('cat', 'hat')
    0.66666666666666674
    >>> sim('Niall', 'Neil')
    0.40000000000000002
    >>> sim('aluminum', 'Catalan')
    0.125
    >>> sim('ATCG', 'TAGC')
    0.25
    """
    if callable(method):
        return method(src, tar)
    raise AttributeError('Unknown similarity function: ' + str(method))


def dist(src, tar, method=sim_levenshtein):
    """Return a distance between two strings.

    This is a generalized function for calling other distance functions.

    :param str src, tar: two strings to be compared
    :param function method: specifies the similarity metric (Levenshtein by
        default) -- Note that this takes a similarity metric function, not
        a distance metric function.
    :returns: distance according to the specified function
    :rtype: float

    >>> dist('cat', 'hat')
    0.33333333333333326
    >>> dist('Niall', 'Neil')
    0.59999999999999998
    >>> dist('aluminum', 'Catalan')
    0.875
    >>> dist('ATCG', 'TAGC')
    0.75
    """
    if callable(method):
        return 1 - method(src, tar)
    raise AttributeError('Unknown distance function: ' + str(method))


if __name__ == '__main__':
    import doctest
    doctest.testmod()