# -*- coding: utf-8 -*-

# Copyright 2014-2018 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.

"""abydos.distance.

The distance module implements string edit distance functions including:

    - Levenshtein distance
    - Optimal String Alignment distance
    - Damerau-Levenshtein distance
    - Hamming distance
    - Tversky index
    - Sørensen–Dice coefficient & distance
    - Jaccard similarity coefficient & distance
    - overlap similarity & distance
    - Tanimoto coefficient & distance
    - Minkowski distance & similarity
    - Manhattan distance & similarity
    - Euclidean distance & similarity
    - Chebyshev distance & similarity
    - cosine similarity & distance
    - Jaro distance
    - Jaro-Winkler distance (incl. the strcmp95 algorithm variant)
    - Longest common substring
    - Ratcliff-Obershelp similarity & distance
    - Match Rating Algorithm similarity
    - Normalized Compression Distance (NCD) & similarity
    - Monge-Elkan similarity & distance
    - Matrix similarity
    - Needleman-Wunsch score
    - Smith-Waterman score
    - Gotoh score
    - Length similarity
    - Prefix, Suffix, and Identity similarity & distance
    - Modified Language-Independent Product Name Search (MLIPNS) similarity &
      distance
    - Bag distance
    - Editex distance
    - Eudex distances
    - Sift4 distance
    - Baystat distance & similarity
    - Typo distance
    - Indel distance

Functions beginning with the prefixes 'sim' and 'dist' are guaranteed to be
in the range [0, 1], and sim_X = 1 - dist_X, since the two are complements.
If a sim_X function is supplied identical src & tar arguments, it is
guaranteed to return 1; the corresponding dist_X function is guaranteed to
return 0.
"""

from __future__ import division, unicode_literals

from codecs import encode
from collections import Counter, Iterable, defaultdict
from math import log, sqrt
from sys import maxsize, modules
from types import GeneratorType
from unicodedata import normalize

from numpy import float32 as np_float32
from numpy import int as np_int
from numpy import zeros as np_zeros

from six import text_type
from six.moves import range

from .compression import ac_encode, ac_train, rle_encode
from .fingerprint import synoname_toolcode, _synoname_special_table
from .phonetic import eudex, mra
from .qgram import QGrams

try:
    import lzma
except ImportError:  # pragma: no cover
    # If the system lacks the lzma library, that's fine, but lzma compression
    # similarity won't be supported.
    pass

def levenshtein(src, tar, mode='lev', cost=(1, 1, 1, 1)):
    """Return the Levenshtein distance between two strings.

    This is the standard edit distance measure. Cf.
    :cite:`Levenshtein:1965,Levenshtein:1966`.

    Two additional variants: optimal string alignment (aka restricted
    Damerau-Levenshtein distance) :cite:`Boytsov:2011` and the
    Damerau-Levenshtein :cite:`Damerau:1964` distance are also supported.

    The ordinary Levenshtein & Optimal String Alignment distance both
    employ the Wagner-Fischer dynamic programming algorithm
    :cite:`Wagner:1974`.

    Levenshtein edit distance ordinarily has unit insertion, deletion, and
    substitution costs.

    :param str src, tar: two strings to be compared
    :param str mode: specifies a mode for computing the Levenshtein distance:

        - 'lev' (default) computes the ordinary Levenshtein distance,
          in which edits may include inserts, deletes, and substitutions
        - 'osa' computes the Optimal String Alignment distance, in which
          edits may include inserts, deletes, substitutions, and
          transpositions but substrings may only be edited once
        - 'dam' computes the Damerau-Levenshtein distance, in which
          edits may include inserts, deletes, substitutions, and
          transpositions and substrings may undergo repeated edits

    :param tuple cost: a 4-tuple representing the cost of the four possible
        edits: inserts, deletes, substitutions, and transpositions,
        respectively (by default: (1, 1, 1, 1))
    :returns: the Levenshtein distance between src & tar
    :rtype: int (may return a float if cost has float values)

    >>> levenshtein('cat', 'hat')
    1
    >>> levenshtein('Niall', 'Neil')
    3
    >>> levenshtein('aluminum', 'Catalan')
    7
    >>> levenshtein('ATCG', 'TAGC')
    3

    >>> levenshtein('ATCG', 'TAGC', mode='osa')
    2
    >>> levenshtein('ACTG', 'TAGC', mode='osa')
    4

    >>> levenshtein('ATCG', 'TAGC', mode='dam')
    2
    >>> levenshtein('ACTG', 'TAGC', mode='dam')
    3
    """
    ins_cost, del_cost, sub_cost, trans_cost = cost

    if src == tar:
        return 0
    if not src:
        return len(tar) * ins_cost
    if not tar:
        return len(src) * del_cost

    if 'dam' in mode:
        return damerau_levenshtein(src, tar, cost)

    # pylint: disable=no-member
    d_mat = np_zeros((len(src)+1, len(tar)+1), dtype=np_int)
    # pylint: enable=no-member
    for i in range(len(src)+1):
        d_mat[i, 0] = i * del_cost
    for j in range(len(tar)+1):
        d_mat[0, j] = j * ins_cost

    for i in range(len(src)):
        for j in range(len(tar)):
            d_mat[i+1, j+1] = min(
                d_mat[i+1, j] + ins_cost,  # ins
                d_mat[i, j+1] + del_cost,  # del
                d_mat[i, j] + (sub_cost if src[i] != tar[j] else 0)  # sub/==
            )

            if mode == 'osa':
                if ((i+1 > 1 and j+1 > 1 and src[i] == tar[j-1] and
                     src[i-1] == tar[j])):
                    # transposition
                    d_mat[i+1, j+1] = min(d_mat[i+1, j+1],
                                          d_mat[i-1, j-1] + trans_cost)

    return d_mat[len(src), len(tar)]

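# A quick sketch of the cost parameter (values worked out by hand from the
# recurrence above): with substitutions priced at 2, replacing 'c' with 'h'
# costs no less than a delete plus an insert, and with transpositions priced
# at 9, 'osa' mode falls back to plain edits:
#
#     >>> levenshtein('cat', 'hat', cost=(1, 1, 2, 1))
#     2
#     >>> levenshtein('ATCG', 'TAGC', mode='osa', cost=(1, 1, 1, 9))
#     3
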
def dist_levenshtein(src, tar, mode='lev', cost=(1, 1, 1, 1)):
    """Return the normalized Levenshtein distance between two strings.

    The Levenshtein distance is normalized by dividing the Levenshtein
    distance (calculated by any of the three supported methods) by the
    greater of the number of characters in src times the cost of a delete
    and the number of characters in tar times the cost of an insert.
    For the case in which all operations have :math:`cost = 1`, this is
    equivalent to the greater of the length of the two strings src & tar.

    :param str src, tar: two strings to be compared
    :param str mode: specifies a mode for computing the Levenshtein distance:

        - 'lev' (default) computes the ordinary Levenshtein distance,
          in which edits may include inserts, deletes, and substitutions
        - 'osa' computes the Optimal String Alignment distance, in which
          edits may include inserts, deletes, substitutions, and
          transpositions but substrings may only be edited once
        - 'dam' computes the Damerau-Levenshtein distance, in which
          edits may include inserts, deletes, substitutions, and
          transpositions and substrings may undergo repeated edits

    :param tuple cost: a 4-tuple representing the cost of the four possible
        edits: inserts, deletes, substitutions, and transpositions,
        respectively (by default: (1, 1, 1, 1))
    :returns: normalized Levenshtein distance
    :rtype: float

    >>> round(dist_levenshtein('cat', 'hat'), 12)
    0.333333333333
    >>> round(dist_levenshtein('Niall', 'Neil'), 12)
    0.6
    >>> dist_levenshtein('aluminum', 'Catalan')
    0.875
    >>> dist_levenshtein('ATCG', 'TAGC')
    0.75
    """
    if src == tar:
        return 0
    ins_cost, del_cost = cost[:2]
    return (levenshtein(src, tar, mode, cost) /
            (max(len(src)*del_cost, len(tar)*ins_cost)))

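# A hand-worked sketch of the normalization: with cost=(2, 1, 1, 1),
# levenshtein('cat', 'hats') is 3 (substitute 'c'/'h' for 1, insert 's' for
# 2), and the normalizer is max(3 * del_cost, 4 * ins_cost) = 8:
#
#     >>> dist_levenshtein('cat', 'hats', cost=(2, 1, 1, 1))
#     0.375
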
def sim_levenshtein(src, tar, mode='lev', cost=(1, 1, 1, 1)):
    """Return the Levenshtein similarity of two strings.

    Normalized Levenshtein similarity is the complement of normalized
    Levenshtein distance:
    :math:`sim_{Levenshtein} = 1 - dist_{Levenshtein}`.

    :param str src, tar: two strings to be compared
    :param str mode: specifies a mode for computing the Levenshtein distance:

            - 'lev' (default) computes the ordinary Levenshtein distance,
              in which edits may include inserts, deletes, and substitutions
            - 'osa' computes the Optimal String Alignment distance, in which
              edits may include inserts, deletes, substitutions, and
              transpositions but substrings may only be edited once
            - 'dam' computes the Damerau-Levenshtein distance, in which
              edits may include inserts, deletes, substitutions, and
              transpositions and substrings may undergo repeated edits

    :param tuple cost: a 4-tuple representing the cost of the four possible
        edits: inserts, deletes, substitutions, and transpositions,
        respectively (by default: (1, 1, 1, 1))
    :returns: normalized Levenshtein similarity
    :rtype: float

    >>> round(sim_levenshtein('cat', 'hat'), 12)
    0.666666666667
    >>> round(sim_levenshtein('Niall', 'Neil'), 12)
    0.4
    >>> sim_levenshtein('aluminum', 'Catalan')
    0.125
    >>> sim_levenshtein('ATCG', 'TAGC')
    0.25
    """
    return 1 - dist_levenshtein(src, tar, mode, cost)

def damerau_levenshtein(src, tar, cost=(1, 1, 1, 1)):
    """Return the Damerau-Levenshtein distance between two strings.

    This computes the Damerau-Levenshtein distance :cite:`Damerau:1964`.
    Damerau-Levenshtein code is based on Java code by Kevin L. Stern
    :cite:`Stern:2014`, under the MIT license:
    https://github.com/KevinStern/software-and-algorithms/blob/master/src/main/java/blogspot/software_and_algorithms/stern_library/string/DamerauLevenshteinAlgorithm.java

    :param str src, tar: two strings to be compared
    :param tuple cost: a 4-tuple representing the cost of the four possible
        edits: inserts, deletes, substitutions, and transpositions,
        respectively (by default: (1, 1, 1, 1))
    :returns: the Damerau-Levenshtein distance between src & tar
    :rtype: int (may return a float if cost has float values)

    >>> damerau_levenshtein('cat', 'hat')
    1
    >>> damerau_levenshtein('Niall', 'Neil')
    3
    >>> damerau_levenshtein('aluminum', 'Catalan')
    7
    >>> damerau_levenshtein('ATCG', 'TAGC')
    2
    """
    ins_cost, del_cost, sub_cost, trans_cost = cost

    if src == tar:
        return 0
    if not src:
        return len(tar) * ins_cost
    if not tar:
        return len(src) * del_cost

    if 2*trans_cost < ins_cost + del_cost:
        raise ValueError('Unsupported cost assignment; the cost of two ' +
                         'transpositions must not be less than the cost of ' +
                         'an insert plus a delete.')

    # pylint: disable=no-member
    d_mat = (np_zeros((len(src))*(len(tar)), dtype=np_int).
             reshape((len(src), len(tar))))
    # pylint: enable=no-member

    if src[0] != tar[0]:
        d_mat[0, 0] = min(sub_cost, ins_cost + del_cost)

    src_index_by_character = {}
    src_index_by_character[src[0]] = 0
    for i in range(1, len(src)):
        del_distance = d_mat[i-1, 0] + del_cost
        ins_distance = (i+1) * del_cost + ins_cost
        match_distance = (i * del_cost +
                          (0 if src[i] == tar[0] else sub_cost))
        d_mat[i, 0] = min(del_distance, ins_distance, match_distance)

    for j in range(1, len(tar)):
        del_distance = (j+1) * ins_cost + del_cost
        ins_distance = d_mat[0, j-1] + ins_cost
        match_distance = (j * ins_cost +
                          (0 if src[0] == tar[j] else sub_cost))
        d_mat[0, j] = min(del_distance, ins_distance, match_distance)

    for i in range(1, len(src)):
        max_src_letter_match_index = (0 if src[i] == tar[0] else -1)
        for j in range(1, len(tar)):
            candidate_swap_index = (-1 if tar[j] not in
                                    src_index_by_character else
                                    src_index_by_character[tar[j]])
            j_swap = max_src_letter_match_index
            del_distance = d_mat[i-1, j] + del_cost
            ins_distance = d_mat[i, j-1] + ins_cost
            match_distance = d_mat[i-1, j-1]
            if src[i] != tar[j]:
                match_distance += sub_cost
            else:
                max_src_letter_match_index = j

            if candidate_swap_index != -1 and j_swap != -1:
                i_swap = candidate_swap_index

                if i_swap == 0 and j_swap == 0:
                    pre_swap_cost = 0
                else:
                    pre_swap_cost = d_mat[max(0, i_swap-1), max(0, j_swap-1)]
                swap_distance = (pre_swap_cost + (i - i_swap - 1) *
                                 del_cost + (j - j_swap - 1) * ins_cost +
                                 trans_cost)
            else:
                swap_distance = maxsize

            d_mat[i, j] = min(del_distance, ins_distance,
                              match_distance, swap_distance)
        src_index_by_character[src[i]] = i

    return d_mat[len(src)-1, len(tar)-1]

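# A hand-worked sketch of the cost constraint enforced above: with
# transpositions priced at 2, the two swaps in 'ATCG'/'TAGC' are no cheaper
# than ordinary edits, so the distance reverts to 3; a tuple violating
# 2*trans_cost >= ins_cost + del_cost, e.g. (2, 2, 1, 1), raises ValueError:
#
#     >>> damerau_levenshtein('ATCG', 'TAGC', cost=(1, 1, 1, 2))
#     3
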
def dist_damerau(src, tar, cost=(1, 1, 1, 1)):
    """Return the normalized Damerau-Levenshtein distance of two strings.

    Damerau-Levenshtein distance normalized to the interval [0, 1].

    The Damerau-Levenshtein distance is normalized by dividing the
    Damerau-Levenshtein distance by the greater of
    the number of characters in src times the cost of a delete and
    the number of characters in tar times the cost of an insert.
    For the case in which all operations have :math:`cost = 1`, this is
    equivalent to the greater of the length of the two strings src & tar.

    The arguments are identical to those of the levenshtein() function.

    :param str src, tar: two strings to be compared
    :param tuple cost: a 4-tuple representing the cost of the four possible
        edits: inserts, deletes, substitutions, and transpositions,
        respectively (by default: (1, 1, 1, 1))
    :returns: normalized Damerau-Levenshtein distance
    :rtype: float

    >>> round(dist_damerau('cat', 'hat'), 12)
    0.333333333333
    >>> round(dist_damerau('Niall', 'Neil'), 12)
    0.6
    >>> dist_damerau('aluminum', 'Catalan')
    0.875
    >>> dist_damerau('ATCG', 'TAGC')
    0.5
    """
    if src == tar:
        return 0
    ins_cost, del_cost = cost[:2]
    return (damerau_levenshtein(src, tar, cost) /
            (max(len(src)*del_cost, len(tar)*ins_cost)))

def sim_damerau(src, tar, cost=(1, 1, 1, 1)):
    """Return the Damerau-Levenshtein similarity of two strings.

    Normalized Damerau-Levenshtein similarity is the complement of normalized
    Damerau-Levenshtein distance:
    :math:`sim_{Damerau} = 1 - dist_{Damerau}`.

    The arguments are identical to those of the levenshtein() function.

    :param str src, tar: two strings to be compared
    :param tuple cost: a 4-tuple representing the cost of the four possible
        edits: inserts, deletes, substitutions, and transpositions,
        respectively (by default: (1, 1, 1, 1))
    :returns: normalized Damerau-Levenshtein similarity
    :rtype: float

    >>> round(sim_damerau('cat', 'hat'), 12)
    0.666666666667
    >>> round(sim_damerau('Niall', 'Neil'), 12)
    0.4
    >>> sim_damerau('aluminum', 'Catalan')
    0.125
    >>> sim_damerau('ATCG', 'TAGC')
    0.5
    """
    return 1 - dist_damerau(src, tar, cost)

def hamming(src, tar, difflens=True):
    """Return the Hamming distance between two strings.

    Hamming distance :cite:`Hamming:1950` equals the number of character
    positions at which two strings differ. For strings of unequal lengths,
    it is not normally defined. By default, this implementation calculates
    the Hamming distance of the first n characters where n is the lesser of
    the two strings' lengths and adds to this the difference in string
    lengths.

    :param str src, tar: two strings to be compared
    :param bool difflens:
        If True (default), this returns the Hamming distance for those
        characters that have a matching character in both strings plus the
        difference in the strings' lengths. This is equivalent to extending
        the shorter string with obligatorily non-matching characters.
        If False, an exception is raised in the case of strings of unequal
        lengths.
    :returns: the Hamming distance between src & tar
    :rtype: int

    >>> hamming('cat', 'hat')
    1
    >>> hamming('Niall', 'Neil')
    3
    >>> hamming('aluminum', 'Catalan')
    8
    >>> hamming('ATCG', 'TAGC')
    4
    """
    if not difflens and len(src) != len(tar):
        raise ValueError('Undefined for sequences of unequal length; set ' +
                         'difflens to True for Hamming distance between ' +
                         'strings of unequal lengths.')

    hdist = 0
    if difflens:
        hdist += abs(len(src)-len(tar))
    hdist += sum(c1 != c2 for c1, c2 in zip(src, tar))

    return hdist

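# A hand-worked sketch of the difflens behavior: 'cat' matches the first
# three characters of 'cattle', so the distance is just the difference in
# lengths; with difflens=False, unequal lengths raise ValueError instead:
#
#     >>> hamming('cat', 'cattle')
#     3
#     >>> hamming('cat', 'cat', difflens=False)
#     0
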
def dist_hamming(src, tar, difflens=True):
    """Return the normalized Hamming distance between two strings.

    Hamming distance normalized to the interval [0, 1].

    The Hamming distance is normalized by dividing it
    by the greater of the number of characters in src & tar (unless difflens
    is set to False, in which case an exception is raised for strings of
    unequal lengths).

    The arguments are identical to those of the hamming() function.

    :param str src, tar: two strings to be compared
    :param bool difflens:
        If True (default), this returns the Hamming distance for those
        characters that have a matching character in both strings plus the
        difference in the strings' lengths. This is equivalent to extending
        the shorter string with obligatorily non-matching characters.
        If False, an exception is raised in the case of strings of unequal
        lengths.
    :returns: normalized Hamming distance
    :rtype: float

    >>> round(dist_hamming('cat', 'hat'), 12)
    0.333333333333
    >>> dist_hamming('Niall', 'Neil')
    0.6
    >>> dist_hamming('aluminum', 'Catalan')
    1.0
    >>> dist_hamming('ATCG', 'TAGC')
    1.0
    """
    if src == tar:
        return 0
    return hamming(src, tar, difflens) / max(len(src), len(tar))

def sim_hamming(src, tar, difflens=True):
    """Return the normalized Hamming similarity of two strings.

    Hamming similarity normalized to the interval [0, 1].

    Hamming similarity is the complement of normalized Hamming distance:
    :math:`sim_{Hamming} = 1 - dist_{Hamming}`.

    Provided that difflens==True, the Hamming similarity is identical to the
    Language-Independent Product Name Search (LIPNS) similarity score. For
    further information, see the sim_mlipns documentation.

    The arguments are identical to those of the hamming() function.

    :param str src, tar: two strings to be compared
    :param bool difflens:
        If True (default), this returns the Hamming distance for those
        characters that have a matching character in both strings plus the
        difference in the strings' lengths. This is equivalent to extending
        the shorter string with obligatorily non-matching characters.
        If False, an exception is raised in the case of strings of unequal
        lengths.
    :returns: normalized Hamming similarity
    :rtype: float

    >>> round(sim_hamming('cat', 'hat'), 12)
    0.666666666667
    >>> sim_hamming('Niall', 'Neil')
    0.4
    >>> sim_hamming('aluminum', 'Catalan')
    0.0
    >>> sim_hamming('ATCG', 'TAGC')
    0.0
    """
    return 1 - dist_hamming(src, tar, difflens)

def _get_qgrams(src, tar, qval=None, skip=0):
    """Return the Q-Grams in src & tar.

    :param str src, tar: two strings to be compared
        (or QGrams/Counter objects)
    :param int qval: the length of each q-gram; 0 or None for non-q-gram
        version
    :param int skip: the number of characters to skip (only works when
        src and tar are strings)
    :return: Q-Grams
    """
    if isinstance(src, Counter) and isinstance(tar, Counter):
        return src, tar
    if qval and qval > 0:
        return (QGrams(src, qval, '$#', skip),
                QGrams(tar, qval, '$#', skip))
    return Counter(src.strip().split()), Counter(tar.strip().split())

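# A sketch of the non-q-gram fallback: with qval=0 (or None) the inputs are
# tokenized on whitespace into Counters, so two phrases sharing the words
# 'the' and 'brown' have an intersection magnitude of 2:
#
#     >>> q1, q2 = _get_qgrams('the quick brown', 'the slow brown', qval=0)
#     >>> sum((q1 & q2).values())
#     2
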
def sim_tversky(src, tar, qval=2, alpha=1, beta=1, bias=None):
    r"""Return the Tversky index of two strings.

    The Tversky index :cite:`Tversky:1977` is defined as:
    For two sets X and Y:
    :math:`sim_{Tversky}(X, Y) = \frac{|X \cap Y|}
    {|X \cap Y| + \alpha|X - Y| + \beta|Y - X|}`.

    :math:`\alpha = \beta = 1` is equivalent to the Jaccard & Tanimoto
    similarity coefficients.

    :math:`\alpha = \beta = 0.5` is equivalent to the Sørensen-Dice
    similarity coefficient :cite:`Dice:1945,Sorensen:1948`.

    Unequal α and β will tend to emphasize one or the other set's
    contributions:

        - :math:`\alpha > \beta` emphasizes the contributions of X over Y
        - :math:`\alpha < \beta` emphasizes the contributions of Y over X

    Parameter values' relation to 1 emphasizes different types of
    contributions:

        - :math:`\alpha, \beta > 1` emphasize unique contributions over the
          intersection
        - :math:`\alpha, \beta < 1` emphasize the intersection over unique
          contributions

    The symmetric variant is defined in :cite:`Jimenez:2013`. This is
    activated by specifying a bias parameter.

    :param str src, tar: two strings to be compared
        (or QGrams/Counter objects)
    :param int qval: the length of each q-gram; 0 or None for non-q-gram
        version
    :param float alpha, beta: two Tversky index parameters as indicated in
        the description above
    :param float bias: the symmetric Tversky index bias parameter
    :returns: Tversky similarity
    :rtype: float

    >>> sim_tversky('cat', 'hat')
    0.3333333333333333
    >>> sim_tversky('Niall', 'Neil')
    0.2222222222222222
    >>> sim_tversky('aluminum', 'Catalan')
    0.0625
    >>> sim_tversky('ATCG', 'TAGC')
    0.0
    """
    if alpha < 0 or beta < 0:
        raise ValueError('Unsupported weight assignment; alpha and beta ' +
                         'must be greater than or equal to 0.')

    if src == tar:
        return 1.0
    elif not src or not tar:
        return 0.0

    q_src, q_tar = _get_qgrams(src, tar, qval)
    q_src_mag = sum(q_src.values())
    q_tar_mag = sum(q_tar.values())
    q_intersection_mag = sum((q_src & q_tar).values())

    if not q_src or not q_tar:
        return 0.0

    if bias is None:
        return q_intersection_mag / (q_intersection_mag + alpha *
                                     (q_src_mag - q_intersection_mag) +
                                     beta * (q_tar_mag - q_intersection_mag))

    a_val = min(q_src_mag - q_intersection_mag,
                q_tar_mag - q_intersection_mag)
    b_val = max(q_src_mag - q_intersection_mag,
                q_tar_mag - q_intersection_mag)
    c_val = q_intersection_mag + bias
    return c_val / (beta * (alpha * a_val + (1 - alpha) * b_val) + c_val)

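# A hand-worked sketch of the weighting: 'cat' and 'hat' share 2 of their 4
# padded bigrams each, so alpha = beta = 2 gives 2 / (2 + 2*2 + 2*2) = 0.2,
# and the symmetric variant with bias=1 gives (2+1) / (1*(1*2 + 0*2) + 3):
#
#     >>> sim_tversky('cat', 'hat', alpha=2, beta=2)
#     0.2
#     >>> sim_tversky('cat', 'hat', bias=1)
#     0.6
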
def dist_tversky(src, tar, qval=2, alpha=1, beta=1, bias=None):
    """Return the Tversky distance between two strings.

    Tversky distance is the complement of the Tversky index (similarity):
    :math:`dist_{Tversky} = 1-sim_{Tversky}`.

    :param str src, tar: two strings to be compared
        (or QGrams/Counter objects)
    :param int qval: the length of each q-gram; 0 or None for non-q-gram
        version
    :param float alpha, beta: two Tversky index parameters as indicated in
        the sim_tversky() documentation
    :param float bias: the symmetric Tversky index bias parameter
    :returns: Tversky distance
    :rtype: float

    >>> dist_tversky('cat', 'hat')
    0.6666666666666667
    >>> dist_tversky('Niall', 'Neil')
    0.7777777777777778
    >>> dist_tversky('aluminum', 'Catalan')
    0.9375
    >>> dist_tversky('ATCG', 'TAGC')
    1.0
    """
    return 1 - sim_tversky(src, tar, qval, alpha, beta, bias)

def sim_dice(src, tar, qval=2):
    r"""Return the Sørensen–Dice coefficient of two strings.

    For two sets X and Y, the Sørensen–Dice coefficient
    :cite:`Dice:1945,Sorensen:1948` is
    :math:`sim_{dice}(X, Y) = \frac{2 \cdot |X \cap Y|}{|X| + |Y|}`.

    This is identical to the Tversky index :cite:`Tversky:1977` for
    :math:`\alpha = \beta = 0.5`.

    :param str src, tar: two strings to be compared (or QGrams/Counter objects)
    :param int qval: the length of each q-gram; 0 or None for non-q-gram
        version
    :returns: Sørensen–Dice similarity
    :rtype: float

    >>> sim_dice('cat', 'hat')
    0.5
    >>> sim_dice('Niall', 'Neil')
    0.36363636363636365
    >>> sim_dice('aluminum', 'Catalan')
    0.11764705882352941
    >>> sim_dice('ATCG', 'TAGC')
    0.0
    """
    return sim_tversky(src, tar, qval, 0.5, 0.5)

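# As noted above, the Sørensen–Dice coefficient is the Tversky index at
# alpha = beta = 0.5; a quick sanity check:
#
#     >>> sim_dice('cat', 'hat') == sim_tversky('cat', 'hat', 2, 0.5, 0.5)
#     True
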
def dist_dice(src, tar, qval=2):
    """Return the Sørensen–Dice distance between two strings.

    Sørensen–Dice distance is the complement of the Sørensen–Dice
    coefficient: :math:`dist_{dice} = 1 - sim_{dice}`.

    :param str src, tar: two strings to be compared (or QGrams/Counter objects)
    :param int qval: the length of each q-gram; 0 or None for non-q-gram
        version
    :returns: Sørensen–Dice distance
    :rtype: float

    >>> dist_dice('cat', 'hat')
    0.5
    >>> dist_dice('Niall', 'Neil')
    0.6363636363636364
    >>> dist_dice('aluminum', 'Catalan')
    0.8823529411764706
    >>> dist_dice('ATCG', 'TAGC')
    1.0
    """
    return 1 - sim_dice(src, tar, qval)

def sim_jaccard(src, tar, qval=2):
    r"""Return the Jaccard similarity of two strings.

    For two sets X and Y, the Jaccard similarity coefficient
    :cite:`Jaccard:1901` is :math:`sim_{jaccard}(X, Y) =
    \frac{|X \cap Y|}{|X \cup Y|}`.

    This is identical to the Tanimoto similarity coefficient
    :cite:`Tanimoto:1958` and the Tversky index :cite:`Tversky:1977` for
    :math:`\alpha = \beta = 1`.

    :param str src, tar: two strings to be compared (or QGrams/Counter objects)
    :param int qval: the length of each q-gram; 0 or None for non-q-gram
        version
    :returns: Jaccard similarity
    :rtype: float

    >>> sim_jaccard('cat', 'hat')
    0.3333333333333333
    >>> sim_jaccard('Niall', 'Neil')
    0.2222222222222222
    >>> sim_jaccard('aluminum', 'Catalan')
    0.0625
    >>> sim_jaccard('ATCG', 'TAGC')
    0.0
    """
    return sim_tversky(src, tar, qval, 1, 1)

def dist_jaccard(src, tar, qval=2):
    """Return the Jaccard distance between two strings.

    Jaccard distance is the complement of the Jaccard similarity coefficient:
    :math:`dist_{Jaccard} = 1 - sim_{Jaccard}`.

    :param str src, tar: two strings to be compared (or QGrams/Counter objects)
    :param int qval: the length of each q-gram; 0 or None for non-q-gram
        version
    :returns: Jaccard distance
    :rtype: float

    >>> dist_jaccard('cat', 'hat')
    0.6666666666666667
    >>> dist_jaccard('Niall', 'Neil')
    0.7777777777777778
    >>> dist_jaccard('aluminum', 'Catalan')
    0.9375
    >>> dist_jaccard('ATCG', 'TAGC')
    1.0
    """
    return 1 - sim_jaccard(src, tar, qval)

def sim_overlap(src, tar, qval=2):
    r"""Return the overlap coefficient of two strings.

    For two sets X and Y, the overlap coefficient
    :cite:`Szymkiewicz:1934,Simpson:1949`, also called the
    Szymkiewicz-Simpson coefficient, is
    :math:`sim_{overlap}(X, Y) = \frac{|X \cap Y|}{min(|X|, |Y|)}`.

    :param str src, tar: two strings to be compared (or QGrams/Counter objects)
    :param int qval: the length of each q-gram; 0 or None for non-q-gram
        version
    :returns: overlap similarity
    :rtype: float

    >>> sim_overlap('cat', 'hat')
    0.5
    >>> sim_overlap('Niall', 'Neil')
    0.4
    >>> sim_overlap('aluminum', 'Catalan')
    0.125
    >>> sim_overlap('ATCG', 'TAGC')
    0.0
    """
    if src == tar:
        return 1.0
    elif not src or not tar:
        return 0.0

    q_src, q_tar = _get_qgrams(src, tar, qval)
    q_src_mag = sum(q_src.values())
    q_tar_mag = sum(q_tar.values())
    q_intersection_mag = sum((q_src & q_tar).values())

    return q_intersection_mag / min(q_src_mag, q_tar_mag)

def dist_overlap(src, tar, qval=2):
    """Return the overlap distance between two strings.

    Overlap distance is the complement of the overlap coefficient:
    :math:`dist_{overlap} = 1 - sim_{overlap}`.

    :param str src, tar: two strings to be compared (or QGrams/Counter objects)
    :param int qval: the length of each q-gram; 0 or None for non-q-gram
        version
    :returns: overlap distance
    :rtype: float

    >>> dist_overlap('cat', 'hat')
    0.5
    >>> dist_overlap('Niall', 'Neil')
    0.6
    >>> dist_overlap('aluminum', 'Catalan')
    0.875
    >>> dist_overlap('ATCG', 'TAGC')
    1.0
    """
    return 1 - sim_overlap(src, tar, qval)

def sim_tanimoto(src, tar, qval=2):
    r"""Return the Tanimoto similarity of two strings.

    For two sets X and Y, the Tanimoto similarity coefficient
    :cite:`Tanimoto:1958` is
    :math:`sim_{Tanimoto}(X, Y) = \frac{|X \cap Y|}{|X \cup Y|}`.

    This is identical to the Jaccard similarity coefficient
    :cite:`Jaccard:1901` and the Tversky index :cite:`Tversky:1977` for
    :math:`\alpha = \beta = 1`.

    :param str src, tar: two strings to be compared (or QGrams/Counter objects)
    :param int qval: the length of each q-gram; 0 or None for non-q-gram
        version
    :returns: Tanimoto similarity
    :rtype: float

    >>> sim_tanimoto('cat', 'hat')
    0.3333333333333333
    >>> sim_tanimoto('Niall', 'Neil')
    0.2222222222222222
    >>> sim_tanimoto('aluminum', 'Catalan')
    0.0625
    >>> sim_tanimoto('ATCG', 'TAGC')
    0.0
    """
    return sim_jaccard(src, tar, qval)

def tanimoto(src, tar, qval=2):
    """Return the Tanimoto distance between two strings.

    Tanimoto distance is :math:`-log_{2}sim_{Tanimoto}`. Note that, as the
    doctests below show, this function returns :math:`log_{2}sim_{Tanimoto}`,
    the negation of the classically defined Tanimoto distance.

    :param str src, tar: two strings to be compared (or QGrams/Counter objects)
    :param int qval: the length of each q-gram; 0 or None for non-q-gram
        version
    :returns: Tanimoto distance
    :rtype: float

    >>> tanimoto('cat', 'hat')
    -1.5849625007211563
    >>> tanimoto('Niall', 'Neil')
    -2.1699250014423126
    >>> tanimoto('aluminum', 'Catalan')
    -4.0
    >>> tanimoto('ATCG', 'TAGC')
    -inf
    """
    coeff = sim_jaccard(src, tar, qval)
    if coeff != 0:
        return log(coeff, 2)

    return float('-inf')

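# As the note above says, tanimoto() returns the base-2 log of the Jaccard
# coefficient, so it can be recovered directly:
#
#     >>> from math import log
#     >>> tanimoto('cat', 'hat') == log(sim_jaccard('cat', 'hat'), 2)
#     True
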
def minkowski(src, tar, qval=2, pval=1, normalize=False):
    """Return the Minkowski distance (:math:`L^p-norm`) of two strings.

    The Minkowski distance :cite:`Minkowski:1910` is a distance metric in
    :math:`L^p-space`.

    :param str src, tar: two strings to be compared (or QGrams/Counter objects)
    :param int qval: the length of each q-gram; 0 or None for non-q-gram
        version
    :param pval: the :math:`p`-value of the :math:`L^p`-space.
    :param normalize: normalizes to [0, 1] if True
    :returns: the Minkowski distance
    :rtype: float
    """
    q_src, q_tar = _get_qgrams(src, tar, qval)
    diffs = ((q_src - q_tar) + (q_tar - q_src)).values()

    normalizer = 1
    if normalize:
        totals = (q_src + q_tar).values()
        if pval == 0:
            normalizer = len(totals)
        else:
            normalizer = sum(_**pval for _ in totals)**(1 / pval)

    if pval == float('inf'):
        # Chebyshev distance
        return max(diffs)/normalizer
    if pval == 0:
        # This is the l_0 "norm" as developed by David Donoho
        return len(diffs)/normalizer
    return sum(_**pval for _ in diffs)**(1 / pval)/normalizer

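# Since minkowski() carries no doctests, a few hand-worked values as a
# sketch: 'cat' and 'hat' differ in four padded bigrams, each with unit
# count, so the L^1 distance is 4, the L^2 distance is sqrt(4) = 2, and the
# L^inf distance is the largest single difference, 1:
#
#     >>> minkowski('cat', 'hat')
#     4.0
#     >>> minkowski('cat', 'hat', pval=2)
#     2.0
#     >>> minkowski('cat', 'hat', pval=float('inf'))
#     1.0
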
def dist_minkowski(src, tar, qval=2, pval=1):
    """Return Minkowski distance of two strings, normalized to [0, 1].

    The normalized Minkowski distance :cite:`Minkowski:1910` is a distance
    metric in :math:`L^p-space`, normalized to [0, 1].

    :param str src, tar: two strings to be compared (or QGrams/Counter objects)
    :param int qval: the length of each q-gram; 0 or None for non-q-gram
        version
    :param pval: the :math:`p`-value of the :math:`L^p`-space.
    :returns: the normalized Minkowski distance
    :rtype: float
    """
    return minkowski(src, tar, qval, pval, True)

def sim_minkowski(src, tar, qval=2, pval=1):
    """Return Minkowski similarity of two strings, normalized to [0, 1].

    Minkowski similarity is the complement of Minkowski distance:
    :math:`sim_{Minkowski} = 1 - dist_{Minkowski}`.

    :param str src, tar: two strings to be compared (or QGrams/Counter objects)
    :param int qval: the length of each q-gram; 0 or None for non-q-gram
        version
    :param pval: the :math:`p`-value of the :math:`L^p`-space.
    :returns: the normalized Minkowski similarity
    :rtype: float
    """
    return 1 - minkowski(src, tar, qval, pval, True)

def manhattan(src, tar, qval=2, normalize=False):
    """Return the Manhattan distance between two strings.

    Manhattan distance is the city-block or taxi-cab distance, equivalent
    to Minkowski distance in :math:`L^1`-space.

    :param str src, tar: two strings to be compared (or QGrams/Counter objects)
    :param int qval: the length of each q-gram; 0 or None for non-q-gram
        version
    :param normalize: normalizes to [0, 1] if True
    :returns: the Manhattan distance
    :rtype: float
    """
    return minkowski(src, tar, qval, 1, normalize)

def dist_manhattan(src, tar, qval=2):
    """Return the Manhattan distance between two strings, normalized to [0, 1].

    The normalized Manhattan distance is a distance
    metric in :math:`L^1-space`, normalized to [0, 1].

    This is equivalent to the Bray-Curtis dissimilarity.

    :param str src, tar: two strings to be compared (or QGrams/Counter objects)
    :param int qval: the length of each q-gram; 0 or None for non-q-gram
        version
    :returns: the normalized Manhattan distance
    :rtype: float
    """
    return manhattan(src, tar, qval, True)

def sim_manhattan(src, tar, qval=2):
    """Return the Manhattan similarity of two strings, normalized to [0, 1].

    Manhattan similarity is the complement of Manhattan distance:
    :math:`sim_{Manhattan} = 1 - dist_{Manhattan}`.

    :param str src, tar: two strings to be compared (or QGrams/Counter objects)
    :param int qval: the length of each q-gram; 0 or None for non-q-gram
        version
    :returns: the normalized Manhattan similarity
    :rtype: float
    """
    return 1 - manhattan(src, tar, qval, True)

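# Hand-worked values for the Manhattan family, as a sketch: 'cat' and 'hat'
# differ in four of their eight padded bigrams (counted with multiplicity),
# so the raw distance is 4 and the normalized distance is 4/8 = 0.5:
#
#     >>> manhattan('cat', 'hat')
#     4.0
#     >>> dist_manhattan('cat', 'hat')
#     0.5
#     >>> sim_manhattan('cat', 'hat')
#     0.5
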
def euclidean(src, tar, qval=2, normalize=False):
    """Return the Euclidean distance between two strings.

    Euclidean distance is the straight-line or as-the-crow-flies distance,
    equivalent to Minkowski distance in :math:`L^2`-space.

    :param str src, tar: two strings to be compared (or QGrams/Counter objects)
    :param int qval: the length of each q-gram; 0 or None for non-q-gram
        version
    :param normalize: normalizes to [0, 1] if True
    :returns: the Euclidean distance
    :rtype: float
    """
    return minkowski(src, tar, qval, 2, normalize)

def dist_euclidean(src, tar, qval=2):
    """Return the Euclidean distance between two strings, normalized to [0, 1].

    The normalized Euclidean distance is a distance
    metric in :math:`L^2-space`, normalized to [0, 1].

    :param str src, tar: two strings to be compared (or QGrams/Counter objects)
    :param int qval: the length of each q-gram; 0 or None for non-q-gram
        version
    :returns: the normalized Euclidean distance
    :rtype: float
    """
    return euclidean(src, tar, qval, True)

def sim_euclidean(src, tar, qval=2):
    """Return the Euclidean similarity of two strings, normalized to [0, 1].

    Euclidean similarity is the complement of Euclidean distance:
    :math:`sim_{Euclidean} = 1 - dist_{Euclidean}`.

    :param str src, tar: two strings to be compared (or QGrams/Counter objects)
    :param int qval: the length of each q-gram; 0 or None for non-q-gram
        version
    :returns: the normalized Euclidean similarity
    :rtype: float
    """
    return 1 - euclidean(src, tar, qval, True)

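# Hand-worked values for the Euclidean family, as a sketch: the four
# differing unit-count bigrams of 'cat'/'hat' give sqrt(4) = 2, and the
# normalizer is sqrt(1+1+4+4+1+1) = sqrt(12), so the normalized distance is
# 2/sqrt(12):
#
#     >>> euclidean('cat', 'hat')
#     2.0
#     >>> round(dist_euclidean('cat', 'hat'), 12)
#     0.57735026919
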
def chebyshev(src, tar, qval=2, normalize=False):
    r"""Return the Chebyshev distance between two strings.

    Chebyshev distance is the chessboard distance,
    equivalent to Minkowski distance in :math:`L^\infty`-space.

    :param str src, tar: two strings to be compared (or QGrams/Counter objects)
    :param int qval: the length of each q-gram; 0 or None for non-q-gram
        version
    :param normalize: normalizes to [0, 1] if True
    :returns: the Chebyshev distance
    :rtype: float
    """
    return minkowski(src, tar, qval, float('inf'), normalize)

def dist_chebyshev(src, tar, qval=2):
    r"""Return the Chebyshev distance between two strings, normalized to [0, 1].

    The normalized Chebyshev distance :cite:`Minkowski:1910` is a distance
    metric in :math:`L^\infty-space`, normalized to [0, 1].

    :param str src, tar: two strings to be compared (or QGrams/Counter objects)
    :param int qval: the length of each q-gram; 0 or None for non-q-gram
        version
    :returns: the normalized Chebyshev distance
    :rtype: float
    """
    return chebyshev(src, tar, qval, True)

def sim_chebyshev(src, tar, qval=2):
    """Return the Chebyshev similarity of two strings, normalized to [0, 1].

    Chebyshev similarity is the complement of Chebyshev distance:
    :math:`sim_{Chebyshev} = 1 - dist_{Chebyshev}`.

    :param str src, tar: two strings to be compared (or QGrams/Counter objects)
    :param int qval: the length of each q-gram; 0 or None for non-q-gram
        version
    :returns: the normalized Chebyshev similarity
    :rtype: float
    """
    return 1 - chebyshev(src, tar, qval, True)

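# Hand-worked values for Chebyshev, as a sketch: the largest per-bigram count
# difference between 'cat' and 'hat' is 1, while 'aaaa' has three 'aa'
# bigrams to the one in 'aa', giving a maximum difference of 2:
#
#     >>> chebyshev('cat', 'hat')
#     1.0
#     >>> chebyshev('aaaa', 'aa')
#     2.0
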
def sim_cosine(src, tar, qval=2):
    r"""Return the cosine similarity of two strings.

    For two sets X and Y, the cosine similarity, Otsuka-Ochiai coefficient, or
    Ochiai coefficient :cite:`Otsuka:1936,Ochiai:1957` is:
    :math:`sim_{cosine}(X, Y) = \frac{|X \cap Y|}{\sqrt{|X| \cdot |Y|}}`.

    :param str src, tar: two strings to be compared (or QGrams/Counter objects)
    :param int qval: the length of each q-gram; 0 or None for non-q-gram
        version
    :returns: cosine similarity
    :rtype: float

    >>> sim_cosine('cat', 'hat')
    0.5
    >>> sim_cosine('Niall', 'Neil')
    0.3651483716701107
    >>> sim_cosine('aluminum', 'Catalan')
    0.11785113019775793
    >>> sim_cosine('ATCG', 'TAGC')
    0.0
    """
    if src == tar:
        return 1.0
    if not src or not tar:
        return 0.0

    q_src, q_tar = _get_qgrams(src, tar, qval)
    q_src_mag = sum(q_src.values())
    q_tar_mag = sum(q_tar.values())
    q_intersection_mag = sum((q_src & q_tar).values())

    return q_intersection_mag / sqrt(q_src_mag * q_tar_mag)

def dist_cosine(src, tar, qval=2):
    """Return the cosine distance between two strings.

    Cosine distance is the complement of cosine similarity:
    :math:`dist_{cosine} = 1 - sim_{cosine}`.

    :param str src, tar: two strings to be compared (or QGrams/Counter objects)
    :param int qval: the length of each q-gram; 0 or None for non-q-gram
        version
    :returns: cosine distance
    :rtype: float

    >>> dist_cosine('cat', 'hat')
    0.5
    >>> dist_cosine('Niall', 'Neil')
    0.6348516283298893
    >>> dist_cosine('aluminum', 'Catalan')
    0.882148869802242
    >>> dist_cosine('ATCG', 'TAGC')
    1.0
    """
    return 1 - sim_cosine(src, tar, qval)

def sim_strcmp95(src, tar, long_strings=False):
    """Return the strcmp95 similarity of two strings.

    This is a Python translation of the C code for strcmp95:
    http://web.archive.org/web/20110629121242/http://www.census.gov/geo/msb/stand/strcmp.c
    :cite:`Winkler:1994`.
    The above file is a US Government publication and, accordingly,
    in the public domain.

    This is based on the Jaro-Winkler distance, but also attempts to correct
    for some common typos and frequently confused characters. It is also
    limited to uppercase ASCII characters, so it is appropriate for American
    names, but not much else.

    :param str src, tar: two strings to be compared
    :param bool long_strings: set to True to "Increase the probability of a
        match when the number of matched characters is large.  This option
        allows for a little more tolerance when the strings are large. It is
        not an appropriate test when comparing fixed length fields such as
        phone and social security numbers."
    :returns: strcmp95 similarity
    :rtype: float

    >>> sim_strcmp95('cat', 'hat')
    0.7777777777777777
    >>> sim_strcmp95('Niall', 'Neil')
    0.8454999999999999
    >>> sim_strcmp95('aluminum', 'Catalan')
    0.6547619047619048
    >>> sim_strcmp95('ATCG', 'TAGC')
    0.8333333333333334
    """
    def _inrange(char):
        """Return True if char is in the range (0, 91)."""
        return ord(char) > 0 and ord(char) < 91

    ying = src.strip().upper()
    yang = tar.strip().upper()

    if ying == yang:
        return 1.0
    # If either string is blank - return - added in Version 2
    if not ying or not yang:
        return 0.0

    adjwt = defaultdict(int)
    sp_mx = (
        ('A', 'E'), ('A', 'I'), ('A', 'O'), ('A', 'U'), ('B', 'V'), ('E', 'I'),
        ('E', 'O'), ('E', 'U'), ('I', 'O'), ('I', 'U'), ('O', 'U'), ('I', 'Y'),
        ('E', 'Y'), ('C', 'G'), ('E', 'F'), ('W', 'U'), ('W', 'V'), ('X', 'K'),
        ('S', 'Z'), ('X', 'S'), ('Q', 'C'), ('U', 'V'), ('M', 'N'), ('L', 'I'),
        ('Q', 'O'), ('P', 'R'), ('I', 'J'), ('2', 'Z'), ('5', 'S'), ('8', 'B'),
        ('1', 'I'), ('1', 'L'), ('0', 'O'), ('0', 'Q'), ('C', 'K'), ('G', 'J')
    )

    # Initialize the adjwt array on the first call to the function only.
    # The adjwt array is used to give partial credit for characters that
    # may be errors due to known phonetic or character recognition errors.
    # A typical example is to match the letter "O" with the number "0"
    for i in sp_mx:
        adjwt[(i[0], i[1])] = 3
        adjwt[(i[1], i[0])] = 3

    if len(ying) > len(yang):
        search_range = len(ying)
        minv = len(yang)
    else:
        search_range = len(yang)
        minv = len(ying)

    # Blank out the flags
    ying_flag = [0] * search_range
    yang_flag = [0] * search_range
    search_range = max(0, search_range // 2 - 1)

    # Looking only within the search range, count and flag the matched pairs.
    num_com = 0
    yl1 = len(yang) - 1
    for i in range(len(ying)):
        lowlim = (i - search_range) if (i >= search_range) else 0
        hilim = (i + search_range) if ((i + search_range) <= yl1) else yl1
        for j in range(lowlim, hilim+1):
            if (yang_flag[j] == 0) and (yang[j] == ying[i]):
                yang_flag[j] = 1
                ying_flag[i] = 1
                num_com += 1
                break

    # If no characters in common - return
    if num_com == 0:
        return 0.0

    # Count the number of transpositions
    k = n_trans = 0
    for i in range(len(ying)):
        if ying_flag[i] != 0:
            for j in range(k, len(yang)):
                if yang_flag[j] != 0:
                    k = j + 1
                    break
            if ying[i] != yang[j]:
                n_trans += 1
    n_trans = n_trans // 2

    # Adjust for similarities in unmatched characters
    n_simi = 0
    if minv > num_com:
        for i in range(len(ying)):
            if ying_flag[i] == 0 and _inrange(ying[i]):
                for j in range(len(yang)):
                    if yang_flag[j] == 0 and _inrange(yang[j]):
                        if (ying[i], yang[j]) in adjwt:
                            n_simi += adjwt[(ying[i], yang[j])]
                            yang_flag[j] = 2
                            break
    num_sim = n_simi/10.0 + num_com

    # Main weight computation
    weight = num_sim / len(ying) + num_sim / len(yang) + \
        (num_com - n_trans) / num_com
    weight = weight / 3.0

    # Continue to boost the weight if the strings are similar
    if weight > 0.7:

        # Adjust for having up to the first 4 characters in common
        j = 4 if (minv >= 4) else minv
        i = 0
        while (i < j) and (ying[i] == yang[i]) and (not ying[i].isdigit()):
            i += 1
        if i:
            weight += i * 0.1 * (1.0 - weight)

        # Optionally adjust for long strings.

        # After agreeing beginning chars, at least two more must agree and
        # the agreeing characters must be > .5 of remaining characters.
        if (((long_strings) and (minv > 4) and (num_com > i+1) and
             (2*num_com >= minv+i))):
            if not ying[0].isdigit():
                weight += (1.0-weight) * ((num_com-i-1) /
                                          (len(ying)+len(yang)-i*2+2))

    return weight

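# A familiar worked example, as a sketch: for the classic pair
# 'MARTHA'/'MARHTA' there are 6 matches and one transposition, giving a Jaro
# score of 17/18, which the prefix boost (3 common characters, 'MAR') lifts:
#
#     >>> round(sim_strcmp95('MARTHA', 'MARHTA'), 12)
#     0.961111111111
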
def dist_strcmp95(src, tar, long_strings=False):
    """Return the strcmp95 distance between two strings.

    strcmp95 distance is the complement of strcmp95 similarity:
    :math:`dist_{strcmp95} = 1 - sim_{strcmp95}`.

    :param str src, tar: two strings to be compared
    :param bool long_strings: set to True to "Increase the probability of a
        match when the number of matched characters is large.  This option
        allows for a little more tolerance when the strings are large. It is
        not an appropriate test when comparing fixed length fields such as
        phone and social security numbers."
    :returns: strcmp95 distance
    :rtype: float

    >>> round(dist_strcmp95('cat', 'hat'), 12)
    0.222222222222
    >>> round(dist_strcmp95('Niall', 'Neil'), 12)
    0.1545
    >>> round(dist_strcmp95('aluminum', 'Catalan'), 12)
    0.345238095238
    >>> round(dist_strcmp95('ATCG', 'TAGC'), 12)
    0.166666666667
    """
    return 1 - sim_strcmp95(src, tar, long_strings)

def sim_jaro_winkler(src, tar, qval=1, mode='winkler', long_strings=False,
                     boost_threshold=0.7, scaling_factor=0.1):
    """Return the Jaro or Jaro-Winkler similarity of two strings.

    Jaro(-Winkler) distance is a string edit distance initially proposed by
    Jaro and extended by Winkler :cite:`Jaro:1989,Winkler:1990`.

    This Python version is based on the C code for strcmp95:
    http://web.archive.org/web/20110629121242/http://www.census.gov/geo/msb/stand/strcmp.c
    :cite:`Winkler:1994`. The above file is a US Government publication and,
    accordingly, in the public domain.

    :param str src, tar: two strings to be compared
    :param int qval: the length of each q-gram (defaults to 1: character-wise
        matching)
    :param str mode: indicates which variant of this distance metric to
        compute:

            - 'winkler' -- computes the Jaro-Winkler distance (default) which
              increases the score for matches near the start of the word
            - 'jaro' -- computes the Jaro distance

    The following arguments apply only when mode is 'winkler':

    :param bool long_strings: set to True to "Increase the probability of a
        match when the number of matched characters is large.  This option
        allows for a little more tolerance when the strings are large.  It is
        not an appropriate test when comparing fixed length fields such as
        phone and social security numbers."
    :param float boost_threshold: a value between 0 and 1, below which the
        Winkler boost is not applied (defaults to 0.7)
    :param float scaling_factor: a value between 0 and 0.25, indicating by how
        much to boost scores for matching prefixes (defaults to 0.1)

    :returns: Jaro or Jaro-Winkler similarity
    :rtype: float

    >>> round(sim_jaro_winkler('cat', 'hat'), 12)
    0.777777777778
    >>> round(sim_jaro_winkler('Niall', 'Neil'), 12)
    0.805
    >>> round(sim_jaro_winkler('aluminum', 'Catalan'), 12)
    0.60119047619
    >>> round(sim_jaro_winkler('ATCG', 'TAGC'), 12)
    0.833333333333

    >>> round(sim_jaro_winkler('cat', 'hat', mode='jaro'), 12)
    0.777777777778
    >>> round(sim_jaro_winkler('Niall', 'Neil', mode='jaro'), 12)
    0.783333333333
    >>> round(sim_jaro_winkler('aluminum', 'Catalan', mode='jaro'), 12)
    0.60119047619
    >>> round(sim_jaro_winkler('ATCG', 'TAGC', mode='jaro'), 12)
    0.833333333333
    """
    if mode == 'winkler':
        if boost_threshold > 1 or boost_threshold < 0:
            raise ValueError('Unsupported boost_threshold assignment; ' +
                             'boost_threshold must be between 0 and 1.')
        if scaling_factor > 0.25 or scaling_factor < 0:
            raise ValueError('Unsupported scaling_factor assignment; ' +
                             'scaling_factor must be between 0 and 0.25.')

    if src == tar:
        return 1.0

    src = QGrams(src.strip(), qval).ordered_list
    tar = QGrams(tar.strip(), qval).ordered_list

    lens = len(src)
    lent = len(tar)

    # If either string is blank - return - added in Version 2
    if lens == 0 or lent == 0:
        return 0.0

    if lens > lent:
        search_range = lens
        minv = lent
    else:
        search_range = lent
        minv = lens

    # Zero out the flags
    src_flag = [0] * search_range
    tar_flag = [0] * search_range
    search_range = max(0, search_range//2 - 1)

    # Looking only within the search range, count and flag the matched pairs.
    num_com = 0
    yl1 = lent - 1
    for i in range(lens):
        lowlim = (i - search_range) if (i >= search_range) else 0
        hilim = (i + search_range) if ((i + search_range) <= yl1) else yl1
        for j in range(lowlim, hilim+1):
            if (tar_flag[j] == 0) and (tar[j] == src[i]):
                tar_flag[j] = 1
                src_flag[i] = 1
                num_com += 1
                break

    # If no characters in common - return
    if num_com == 0:
        return 0.0

    # Count the number of transpositions
    k = n_trans = 0
    for i in range(lens):
        if src_flag[i] != 0:
            for j in range(k, lent):
                if tar_flag[j] != 0:
                    k = j + 1
                    break
            if src[i] != tar[j]:
0 ignored issues
show
introduced by
The variable j does not seem to be defined for all execution paths.
Loading history...
1458
                n_trans += 1
1459
    n_trans = n_trans // 2
1460
1461
    # Main weight computation for Jaro distance
1462
    weight = num_com / lens + num_com / lent + (num_com - n_trans) / num_com
1463
    weight = weight / 3.0
1464
1465
    # Continue to boost the weight if the strings are similar
1466
    # This is the Winkler portion of Jaro-Winkler distance
1467
    if mode == 'winkler' and weight > boost_threshold:
1468
1469
        # Adjust for having up to the first 4 characters in common
1470
        j = 4 if (minv >= 4) else minv
1471
        i = 0
1472
        while (i < j) and (src[i] == tar[i]):
1473
            i += 1
1474
        if i:
1475
            weight += i * scaling_factor * (1.0 - weight)
1476
1477
        # Optionally adjust for long strings.
1478
1479
        # After agreeing beginning chars, at least two more must agree and
1480
        # the agreeing characters must be > .5 of remaining characters.
1481
        if (((long_strings) and (minv > 4) and (num_com > i+1) and
1482
             (2*num_com >= minv+i))):
1483
            weight += (1.0-weight) * ((num_com-i-1) / (lens+lent-i*2+2))
1484
1485
    return weight
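

# The sketch below is an added illustration, not part of the original module:
# it traces the Winkler prefix boost on the classic 'MARTHA'/'MARHTA' pair,
# whose Jaro weight is (6/6 + 6/6 + 5/6)/3 and whose matched prefix length
# is 3.
def _example_winkler_boost():  # pragma: no cover
    """Trace the Winkler prefix boost by hand.

    >>> jaro = sim_jaro_winkler('MARTHA', 'MARHTA', mode='jaro')
    >>> round(jaro + 3 * 0.1 * (1 - jaro), 3)
    0.961
    >>> round(sim_jaro_winkler('MARTHA', 'MARHTA'), 3)
    0.961
    """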


def dist_jaro_winkler(src, tar, qval=1, mode='winkler', long_strings=False,
                      boost_threshold=0.7, scaling_factor=0.1):
    """Return the Jaro or Jaro-Winkler distance between two strings.

    Jaro(-Winkler) distance is the complement of Jaro(-Winkler) similarity:
    :math:`dist_{Jaro(-Winkler)} = 1 - sim_{Jaro(-Winkler)}`.

    :param str src, tar: two strings to be compared
    :param int qval: the length of each q-gram (defaults to 1: character-wise
        matching)
    :param str mode: indicates which variant of this distance metric to
        compute:

            - 'winkler' -- computes the Jaro-Winkler distance (default) which
              increases the score for matches near the start of the word
            - 'jaro' -- computes the Jaro distance

    The following arguments apply only when mode is 'winkler':

    :param bool long_strings: set to True to "Increase the probability of a
        match when the number of matched characters is large.  This option
        allows for a little more tolerance when the strings are large.  It is
        not an appropriate test when comparing fixed length fields such as
        phone and social security numbers."
    :param float boost_threshold: a value between 0 and 1, below which the
        Winkler boost is not applied (defaults to 0.7)
    :param float scaling_factor: a value between 0 and 0.25, indicating by how
        much to boost scores for matching prefixes (defaults to 0.1)

    :returns: Jaro or Jaro-Winkler distance
    :rtype: float

    >>> round(dist_jaro_winkler('cat', 'hat'), 12)
    0.222222222222
    >>> round(dist_jaro_winkler('Niall', 'Neil'), 12)
    0.195
    >>> round(dist_jaro_winkler('aluminum', 'Catalan'), 12)
    0.39880952381
    >>> round(dist_jaro_winkler('ATCG', 'TAGC'), 12)
    0.166666666667

    >>> round(dist_jaro_winkler('cat', 'hat', mode='jaro'), 12)
    0.222222222222
    >>> round(dist_jaro_winkler('Niall', 'Neil', mode='jaro'), 12)
    0.216666666667
    >>> round(dist_jaro_winkler('aluminum', 'Catalan', mode='jaro'), 12)
    0.39880952381
    >>> round(dist_jaro_winkler('ATCG', 'TAGC', mode='jaro'), 12)
    0.166666666667
    """
    return 1 - sim_jaro_winkler(src, tar, qval, mode, long_strings,
                                boost_threshold, scaling_factor)


def lcsseq(src, tar):
    """Return the longest common subsequence of two strings.

    Longest common subsequence (LCSseq) is the longest subsequence of
    characters that two strings have in common.

    Based on the dynamic programming algorithm from
    http://rosettacode.org/wiki/Longest_common_subsequence#Dynamic_Programming_6
    :cite:`rosettacode:2018b`. This is licensed GFDL 1.2.

    Modifications include:
        conversion to a numpy array in place of a list of lists

    :param str src, tar: two strings to be compared
    :returns: the longest common subsequence
    :rtype: str

    >>> lcsseq('cat', 'hat')
    'at'
    >>> lcsseq('Niall', 'Neil')
    'Nil'
    >>> lcsseq('aluminum', 'Catalan')
    'aln'
    >>> lcsseq('ATCG', 'TAGC')
    'AC'
    """
    # pylint: disable=no-member
    lengths = np_zeros((len(src)+1, len(tar)+1), dtype=np_int)
    # pylint: enable=no-member

    # row 0 and column 0 are initialized to 0 already
    for i, src_char in enumerate(src):
        for j, tar_char in enumerate(tar):
            if src_char == tar_char:
                lengths[i+1, j+1] = lengths[i, j] + 1
            else:
                lengths[i+1, j+1] = max(lengths[i+1, j], lengths[i, j+1])

    # read the subsequence out from the matrix
    result = ''
    i, j = len(src), len(tar)
    while i != 0 and j != 0:
        if lengths[i, j] == lengths[i-1, j]:
            i -= 1
        elif lengths[i, j] == lengths[i, j-1]:
            j -= 1
        else:
            result = src[i-1] + result
            i -= 1
            j -= 1
    return result
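

# Added illustration (not part of the original module): lcsseq matches
# characters in order but not necessarily contiguously, so it can recover
# 'banana' whole, while the contiguous lcsstr (defined below) cannot.
def _example_lcsseq_vs_lcsstr():  # pragma: no cover
    """Contrast a subsequence match with a substring match.

    >>> lcsseq('banana', 'bandana')
    'banana'
    >>> lcsstr('banana', 'bandana')
    'ban'
    """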


def sim_lcsseq(src, tar):
    r"""Return the longest common subsequence similarity of two strings.

    Longest common subsequence similarity (:math:`sim_{LCSseq}`).

    This employs the LCSseq function to derive a similarity metric:
    :math:`sim_{LCSseq}(s,t) = \frac{|LCSseq(s,t)|}{max(|s|, |t|)}`

    :param str src, tar: two strings to be compared
    :returns: LCSseq similarity
    :rtype: float

    >>> sim_lcsseq('cat', 'hat')
    0.6666666666666666
    >>> sim_lcsseq('Niall', 'Neil')
    0.6
    >>> sim_lcsseq('aluminum', 'Catalan')
    0.375
    >>> sim_lcsseq('ATCG', 'TAGC')
    0.5
    """
    if src == tar:
        return 1.0
    elif not src or not tar:
        return 0.0
    return len(lcsseq(src, tar)) / max(len(src), len(tar))


def dist_lcsseq(src, tar):
    """Return the longest common subsequence distance between two strings.

    Longest common subsequence distance (:math:`dist_{LCSseq}`).

    This employs the LCSseq function to derive a distance metric:
    :math:`dist_{LCSseq}(s,t) = 1 - sim_{LCSseq}(s,t)`

    :param str src, tar: two strings to be compared
    :returns: LCSseq distance
    :rtype: float

    >>> dist_lcsseq('cat', 'hat')
    0.33333333333333337
    >>> dist_lcsseq('Niall', 'Neil')
    0.4
    >>> dist_lcsseq('aluminum', 'Catalan')
    0.625
    >>> dist_lcsseq('ATCG', 'TAGC')
    0.5
    """
    return 1 - sim_lcsseq(src, tar)


def lcsstr(src, tar):
    """Return the longest common substring of two strings.

    Longest common substring (LCSstr).

    Based on the code from
    https://en.wikibooks.org/wiki/Algorithm_Implementation/Strings/Longest_common_substring#Python
    :cite:`Wikibooks:2018`.
    This is licensed Creative Commons: Attribution-ShareAlike 3.0.

    Modifications include:

        - conversion to a numpy array in place of a list of lists
        - conversion to Python 2/3-safe range from xrange via six

    :param str src, tar: two strings to be compared
    :returns: the longest common substring
    :rtype: str

    >>> lcsstr('cat', 'hat')
    'at'
    >>> lcsstr('Niall', 'Neil')
    'N'
    >>> lcsstr('aluminum', 'Catalan')
    'al'
    >>> lcsstr('ATCG', 'TAGC')
    'A'
    """
    # pylint: disable=no-member
    lengths = np_zeros((len(src)+1, len(tar)+1), dtype=np_int)
    # pylint: enable=no-member
    longest, i_longest = 0, 0
    for i in range(1, len(src)+1):
        for j in range(1, len(tar)+1):
            if src[i-1] == tar[j-1]:
                lengths[i, j] = lengths[i-1, j-1] + 1
                if lengths[i, j] > longest:
                    longest = lengths[i, j]
                    i_longest = i
            else:
                lengths[i, j] = 0
    return src[i_longest - longest:i_longest]


def sim_lcsstr(src, tar):
    r"""Return the longest common substring similarity of two strings.

    Longest common substring similarity (:math:`sim_{LCSstr}`).

    This employs the LCSstr function to derive a similarity metric:
    :math:`sim_{LCSstr}(s,t) = \frac{|LCSstr(s,t)|}{max(|s|, |t|)}`

    :param str src, tar: two strings to be compared
    :returns: LCSstr similarity
    :rtype: float

    >>> sim_lcsstr('cat', 'hat')
    0.6666666666666666
    >>> sim_lcsstr('Niall', 'Neil')
    0.2
    >>> sim_lcsstr('aluminum', 'Catalan')
    0.25
    >>> sim_lcsstr('ATCG', 'TAGC')
    0.25
    """
    if src == tar:
        return 1.0
    elif not src or not tar:
        return 0.0
    return len(lcsstr(src, tar)) / max(len(src), len(tar))


def dist_lcsstr(src, tar):
    """Return the longest common substring distance between two strings.

    Longest common substring distance (:math:`dist_{LCSstr}`).

    This employs the LCSstr function to derive a distance metric:
    :math:`dist_{LCSstr}(s,t) = 1 - sim_{LCSstr}(s,t)`

    :param str src, tar: two strings to be compared
    :returns: LCSstr distance
    :rtype: float

    >>> dist_lcsstr('cat', 'hat')
    0.33333333333333337
    >>> dist_lcsstr('Niall', 'Neil')
    0.8
    >>> dist_lcsstr('aluminum', 'Catalan')
    0.75
    >>> dist_lcsstr('ATCG', 'TAGC')
    0.75
    """
    return 1 - sim_lcsstr(src, tar)


def sim_ratcliff_obershelp(src, tar):
    """Return the Ratcliff-Obershelp similarity of two strings.

    This follows the Ratcliff-Obershelp algorithm :cite:`Ratcliff:1988` to
    derive a similarity measure:

        1. Find the length of the longest common substring in src & tar.
        2. Recurse on the strings to the left & right of this substring
           in src & tar. The base case is a 0 length common substring, in
           which case, return 0. Otherwise, return the sum of the current
           longest common substring and the left & right recursed sums.
        3. Multiply this length by 2 and divide by the sum of the lengths of
           src & tar.

    Cf.
    http://www.drdobbs.com/database/pattern-matching-the-gestalt-approach/184407970

    :param str src, tar: two strings to be compared
    :returns: Ratcliff-Obershelp similarity
    :rtype: float

    >>> round(sim_ratcliff_obershelp('cat', 'hat'), 12)
    0.666666666667
    >>> round(sim_ratcliff_obershelp('Niall', 'Neil'), 12)
    0.666666666667
    >>> round(sim_ratcliff_obershelp('aluminum', 'Catalan'), 12)
    0.4
    >>> sim_ratcliff_obershelp('ATCG', 'TAGC')
    0.5
    """
    def _lcsstr_stl(src, tar):
        """Return start positions & length for Ratcliff-Obershelp.

        Return the start position in the source string, start position in
        the target string, and length of the longest common substring of
        strings src and tar.
        """
        # pylint: disable=no-member
        lengths = np_zeros((len(src)+1, len(tar)+1), dtype=np_int)
        # pylint: enable=no-member
        longest, src_longest, tar_longest = 0, 0, 0
        for i in range(1, len(src)+1):
            for j in range(1, len(tar)+1):
                if src[i-1] == tar[j-1]:
                    lengths[i, j] = lengths[i-1, j-1] + 1
                    if lengths[i, j] > longest:
                        longest = lengths[i, j]
                        src_longest = i
                        tar_longest = j
                else:
                    lengths[i, j] = 0
        return (src_longest-longest, tar_longest-longest, longest)

    def _sstr_matches(src, tar):
        """Return the sum of substring match lengths.

        This follows the Ratcliff-Obershelp algorithm :cite:`Ratcliff:1988`:
             1. Find the length of the longest common substring in src & tar.
             2. Recurse on the strings to the left & right of this
                 substring in src & tar.
             3. Base case is a 0 length common substring, in which case,
                 return 0.
             4. Return the sum.
        """
        src_start, tar_start, length = _lcsstr_stl(src, tar)
        if length == 0:
            return 0
        return (_sstr_matches(src[:src_start], tar[:tar_start]) +
                length +
                _sstr_matches(src[src_start+length:], tar[tar_start+length:]))

    if src == tar:
        return 1.0
    elif not src or not tar:
        return 0.0
    return 2*_sstr_matches(src, tar)/(len(src)+len(tar))
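

# Added worked example (not part of the original module): for 'dixon' vs.
# 'dicksonx' the algorithm first matches 'di', then recurses rightward and
# matches 'on', so the similarity is 2*(2+2)/(5+8) = 8/13.
def _example_ratcliff_obershelp():  # pragma: no cover
    """Recompute the gestalt ratio by hand for one pair.

    >>> round(sim_ratcliff_obershelp('dixon', 'dicksonx'), 12)
    0.615384615385
    >>> round(2 * (2 + 2) / (5 + 8), 12)
    0.615384615385
    """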


def dist_ratcliff_obershelp(src, tar):
    """Return the Ratcliff-Obershelp distance between two strings.

    Ratcliff-Obershelp distance is the complement of Ratcliff-Obershelp
    similarity:
    :math:`dist_{Ratcliff-Obershelp} = 1 - sim_{Ratcliff-Obershelp}`.

    :param str src, tar: two strings to be compared
    :returns: Ratcliff-Obershelp distance
    :rtype: float

    >>> round(dist_ratcliff_obershelp('cat', 'hat'), 12)
    0.333333333333
    >>> round(dist_ratcliff_obershelp('Niall', 'Neil'), 12)
    0.333333333333
    >>> round(dist_ratcliff_obershelp('aluminum', 'Catalan'), 12)
    0.6
    >>> dist_ratcliff_obershelp('ATCG', 'TAGC')
    0.5
    """
    return 1 - sim_ratcliff_obershelp(src, tar)


def mra_compare(src, tar):
    """Return the MRA comparison rating of two strings.

    The Western Airlines Surname Match Rating Algorithm comparison rating, as
    presented on page 18 of :cite:`Moore:1977`.

    :param str src, tar: two strings to be compared
    :returns: MRA comparison rating
    :rtype: int

    >>> mra_compare('cat', 'hat')
    5
    >>> mra_compare('Niall', 'Neil')
    6
    >>> mra_compare('aluminum', 'Catalan')
    0
    >>> mra_compare('ATCG', 'TAGC')
    5
    """
    if src == tar:
        return 6
    if src == '' or tar == '':
        return 0
    src = list(mra(src))
    tar = list(mra(tar))

    if abs(len(src)-len(tar)) > 2:
        return 0

    length_sum = len(src) + len(tar)
    if length_sum < 5:
        min_rating = 5
    elif length_sum < 8:
        min_rating = 4
    elif length_sum < 12:
        min_rating = 3
    else:
        min_rating = 2

    for _ in range(2):
        new_src = []
        new_tar = []
        minlen = min(len(src), len(tar))
        for i in range(minlen):
            if src[i] != tar[i]:
                new_src.append(src[i])
                new_tar.append(tar[i])
        src = new_src+src[minlen:]
        tar = new_tar+tar[minlen:]
        src.reverse()
        tar.reverse()

    similarity = 6 - max(len(src), len(tar))

    if similarity >= min_rating:
        return similarity
    return 0
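

# Added illustration (not part of the original module): the often-cited MRA
# pair 'Byrne'/'Boern' encodes to the codices 'BYRN' and 'BRN', whose
# comparison rating of 5 meets the minimum rating of 4 for a combined codex
# length of 7.
def _example_mra_byrne():  # pragma: no cover
    """Check the textbook Byrne/Boern match.

    >>> mra_compare('Byrne', 'Boern')
    5
    >>> sim_mra('Byrne', 'Boern') == 5/6
    True
    """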


def sim_mra(src, tar):
    """Return the normalized MRA similarity of two strings.

    This is the MRA normalized to :math:`[0, 1]`, given that MRA itself is
    constrained to the range :math:`[0, 6]`.

    :param str src, tar: two strings to be compared
    :returns: normalized MRA similarity
    :rtype: float

    >>> sim_mra('cat', 'hat')
    0.8333333333333334
    >>> sim_mra('Niall', 'Neil')
    1.0
    >>> sim_mra('aluminum', 'Catalan')
    0.0
    >>> sim_mra('ATCG', 'TAGC')
    0.8333333333333334
    """
    return mra_compare(src, tar)/6


def dist_mra(src, tar):
    """Return the normalized MRA distance between two strings.

    MRA distance is the complement of MRA similarity:
    :math:`dist_{MRA} = 1 - sim_{MRA}`.

    :param str src, tar: two strings to be compared
    :returns: normalized MRA distance
    :rtype: float

    >>> dist_mra('cat', 'hat')
    0.16666666666666663
    >>> dist_mra('Niall', 'Neil')
    0.0
    >>> dist_mra('aluminum', 'Catalan')
    1.0
    >>> dist_mra('ATCG', 'TAGC')
    0.16666666666666663
    """
    return 1 - sim_mra(src, tar)


def dist_compression(src, tar, compressor='bz2', probs=None):
    """Return the normalized compression distance between two strings.

    Normalized compression distance (NCD) :cite:`Cilibrasi:2005`.

    :param str src, tar: two strings to be compared
    :param str compressor: a compression scheme to use for the similarity
        calculation, from the following:

            - `zlib` -- standard zlib/gzip
            - `bz2` -- bzip2 (default)
            - `lzma` -- Lempel–Ziv–Markov chain algorithm
            - `arith` -- arithmetic coding
            - `rle` -- run-length encoding
            - `bwtrle` -- Burrows-Wheeler transform followed by run-length
              encoding

    :param dict probs: a dictionary trained with ac_train (for the arith
        compressor only)
    :returns: compression distance
    :rtype: float

    >>> dist_compression('cat', 'hat')
    0.08
    >>> dist_compression('Niall', 'Neil')
    0.037037037037037035
    >>> dist_compression('aluminum', 'Catalan')
    0.20689655172413793
    >>> dist_compression('ATCG', 'TAGC')
    0.037037037037037035

    >>> dist_compression('Niall', 'Neil', compressor='zlib')
    0.45454545454545453
    >>> dist_compression('Niall', 'Neil', compressor='bz2')
    0.037037037037037035
    >>> dist_compression('Niall', 'Neil', compressor='lzma')
    0.16
    >>> dist_compression('Niall', 'Neil', compressor='arith')
    0.6875
    >>> dist_compression('Niall', 'Neil', compressor='rle')
    1.0
    >>> dist_compression('Niall', 'Neil', compressor='bwtrle')
    0.8333333333333334
    """
    if src == tar:
        return 0.0

    if compressor not in {'arith', 'rle', 'bwtrle'}:
        src = src.encode('utf-8')
        tar = tar.encode('utf-8')

    if compressor == 'bz2':
        src_comp = encode(src, 'bz2_codec')[15:]
        tar_comp = encode(tar, 'bz2_codec')[15:]
        concat_comp = encode(src+tar, 'bz2_codec')[15:]
        concat_comp2 = encode(tar+src, 'bz2_codec')[15:]
    elif compressor == 'lzma':
        if 'lzma' in modules:
            src_comp = lzma.compress(src)[14:]
            tar_comp = lzma.compress(tar)[14:]
            concat_comp = lzma.compress(src+tar)[14:]
            concat_comp2 = lzma.compress(tar+src)[14:]
        else:  # pragma: no cover
            raise ValueError('Install the PylibLZMA module in order to use ' +
                             'lzma compression similarity')
    elif compressor == 'arith':
        if probs is None:
            # lacking a reasonable dictionary, train on the strings themselves
            probs = ac_train(src+tar)
        src_comp = ac_encode(src, probs)[1]
        tar_comp = ac_encode(tar, probs)[1]
        concat_comp = ac_encode(src+tar, probs)[1]
        concat_comp2 = ac_encode(tar+src, probs)[1]
        return ((min(concat_comp, concat_comp2) - min(src_comp, tar_comp)) /
                max(src_comp, tar_comp))
    elif compressor in {'rle', 'bwtrle'}:
        src_comp = rle_encode(src, (compressor == 'bwtrle'))
        tar_comp = rle_encode(tar, (compressor == 'bwtrle'))
        concat_comp = rle_encode(src+tar, (compressor == 'bwtrle'))
        concat_comp2 = rle_encode(tar+src, (compressor == 'bwtrle'))
    else:  # zlib
        src_comp = encode(src, 'zlib_codec')[2:]
        tar_comp = encode(tar, 'zlib_codec')[2:]
        concat_comp = encode(src+tar, 'zlib_codec')[2:]
        concat_comp2 = encode(tar+src, 'zlib_codec')[2:]
    return ((min(len(concat_comp), len(concat_comp2)) -
             min(len(src_comp), len(tar_comp))) /
            max(len(src_comp), len(tar_comp)))
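

# Added sketch (not part of the original module): the NCD formula above,
# written out by hand for the zlib compressor, where c(x) is the length of
# the zlib-compressed form of x (`encode` is the codecs.encode imported at
# the top of this module).
def _example_ncd_by_hand():  # pragma: no cover
    """Recompute dist_compression('Niall', 'Neil', compressor='zlib').

    >>> def c(text):
    ...     return len(encode(text.encode('utf-8'), 'zlib_codec')[2:])
    >>> c_src, c_tar = c('Niall'), c('Neil')
    >>> c_cat = min(c('NiallNeil'), c('NeilNiall'))
    >>> (c_cat - min(c_src, c_tar)) / max(c_src, c_tar)
    0.45454545454545453
    """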


def sim_compression(src, tar, compressor='bz2', probs=None):
    """Return the normalized compression similarity of two strings.

    Normalized compression similarity is the complement of normalized
    compression distance:
    :math:`sim_{NCS} = 1 - dist_{NCD}`.

    :param str src, tar: two strings to be compared
    :param str compressor: a compression scheme to use for the similarity
        calculation:

            - `zlib` -- standard zlib/gzip
            - `bz2` -- bzip2 (default)
            - `lzma` -- Lempel–Ziv–Markov chain algorithm
            - `arith` -- arithmetic coding
            - `rle` -- run-length encoding
            - `bwtrle` -- Burrows-Wheeler transform followed by run-length
              encoding

    :param dict probs: a dictionary trained with ac_train (for the arith
        compressor only)
    :returns: compression similarity
    :rtype: float

    >>> sim_compression('cat', 'hat')
    0.92
    >>> sim_compression('Niall', 'Neil')
    0.962962962962963
    >>> sim_compression('aluminum', 'Catalan')
    0.7931034482758621
    >>> sim_compression('ATCG', 'TAGC')
    0.962962962962963

    >>> sim_compression('Niall', 'Neil', compressor='zlib')
    0.5454545454545454
    >>> sim_compression('Niall', 'Neil', compressor='bz2')
    0.962962962962963
    >>> sim_compression('Niall', 'Neil', compressor='lzma')
    0.84
    >>> sim_compression('Niall', 'Neil', compressor='arith')
    0.3125
    >>> sim_compression('Niall', 'Neil', compressor='rle')
    0.0
    >>> sim_compression('Niall', 'Neil', compressor='bwtrle')
    0.16666666666666663
    """
    return 1 - dist_compression(src, tar, compressor, probs)


def sim_monge_elkan(src, tar, sim_func=sim_levenshtein, symmetric=False):
    """Return the Monge-Elkan similarity of two strings.

    Monge-Elkan is defined in :cite:`Monge:1996`.

    Note: Monge-Elkan is NOT a symmetric similarity algorithm. Thus, the
    similarity of src to tar is not necessarily equal to the similarity of
    tar to src. If the symmetric argument is True, a symmetric value is
    calculated, at the cost of doubling the computation time (since
    :math:`sim_{Monge-Elkan}(src, tar)` and
    :math:`sim_{Monge-Elkan}(tar, src)` are both calculated and then
    averaged).

    :param str src, tar: two strings to be compared
    :param function sim_func: the internal similarity metric to employ
    :param bool symmetric: return a symmetric similarity measure
    :returns: Monge-Elkan similarity
    :rtype: float

    >>> sim_monge_elkan('cat', 'hat')
    0.75
    >>> round(sim_monge_elkan('Niall', 'Neil'), 12)
    0.666666666667
    >>> round(sim_monge_elkan('aluminum', 'Catalan'), 12)
    0.388888888889
    >>> sim_monge_elkan('ATCG', 'TAGC')
    0.5
    """
    if src == tar:
        return 1.0

    q_src = sorted(QGrams(src).elements())
    q_tar = sorted(QGrams(tar).elements())

    if not q_src or not q_tar:
        return 0.0

    sum_of_maxes = 0
    for q_s in q_src:
        max_sim = float('-inf')
        for q_t in q_tar:
            max_sim = max(max_sim, sim_func(q_s, q_t))
        sum_of_maxes += max_sim
    sim_em = sum_of_maxes / len(q_src)

    if symmetric:
        sim_em = (sim_em + sim_monge_elkan(tar, src, sim_func, False))/2

    return sim_em
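

# Added illustration (not part of the original module): Monge-Elkan is
# asymmetric, since the outer sum runs over the source's q-grams only;
# symmetric=True averages the two directions.
def _example_monge_elkan_asymmetry():  # pragma: no cover
    """Show the two directions and their average.

    >>> round(sim_monge_elkan('Niall', 'Neil'), 12)
    0.666666666667
    >>> sim_monge_elkan('Neil', 'Niall')
    0.7
    >>> round(sim_monge_elkan('Niall', 'Neil', symmetric=True), 12)
    0.683333333333
    """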


def dist_monge_elkan(src, tar, sim_func=sim_levenshtein, symmetric=False):
    """Return the Monge-Elkan distance between two strings.

    Monge-Elkan distance is the complement of Monge-Elkan similarity:
    :math:`dist_{Monge-Elkan} = 1 - sim_{Monge-Elkan}`.

    :param str src, tar: two strings to be compared
    :param function sim_func: the internal similarity metric to employ
    :param bool symmetric: return a symmetric similarity measure
    :returns: Monge-Elkan distance
    :rtype: float

    >>> dist_monge_elkan('cat', 'hat')
    0.25
    >>> round(dist_monge_elkan('Niall', 'Neil'), 12)
    0.333333333333
    >>> round(dist_monge_elkan('aluminum', 'Catalan'), 12)
    0.611111111111
    >>> dist_monge_elkan('ATCG', 'TAGC')
    0.5
    """
    return 1 - sim_monge_elkan(src, tar, sim_func, symmetric)


def sim_ident(src, tar):
    """Return the identity similarity of two strings.

    Identity similarity is 1 if the two strings are identical, otherwise 0.

    :param str src, tar: two strings to be compared
    :returns: identity similarity
    :rtype: int

    >>> sim_ident('cat', 'hat')
    0
    >>> sim_ident('cat', 'cat')
    1
    """
    return int(src == tar)


def dist_ident(src, tar):
    """Return the identity distance between two strings.

    This is 0 if the two strings are identical, otherwise 1, i.e.
    :math:`dist_{identity} = 1 - sim_{identity}`.

    :param str src, tar: two strings to be compared
    :returns: identity distance
    :rtype: int

    >>> dist_ident('cat', 'hat')
    1
    >>> dist_ident('cat', 'cat')
    0
    """
    return 1 - sim_ident(src, tar)


def sim_matrix(src, tar, mat=None, mismatch_cost=0, match_cost=1,
               symmetric=True, alphabet=None):
    """Return the matrix similarity of two strings.

    With the default parameters, this is identical to sim_ident.
    It is possible for sim_matrix to return values outside of the range
    :math:`[0, 1]`, if values outside that range are present in mat,
    mismatch_cost, or match_cost.

    :param str src, tar: two strings to be compared
    :param dict mat: a dict mapping tuples to costs; the tuples are (src, tar)
        pairs of symbols from the alphabet parameter
    :param float mismatch_cost: the value returned if (src, tar) is absent
        from mat when src does not equal tar
    :param float match_cost: the value returned if (src, tar) is absent from
        mat when src equals tar
    :param bool symmetric: True if the cost of src not matching tar is
        identical to the cost of tar not matching src; in this case, the
        values in mat need only contain (src, tar) or (tar, src), not both
    :param str alphabet: a collection of tokens from which src and tar are
        drawn; if this is defined a ValueError is raised if either tar or src
        is not found in alphabet
    :returns: matrix similarity
    :rtype: float

    >>> sim_matrix('cat', 'hat')
    0
    >>> sim_matrix('hat', 'hat')
    1
    """
    if alphabet:
        alphabet = tuple(alphabet)
        for i in src:
            if i not in alphabet:
                raise ValueError('src value not in alphabet')
        for i in tar:
            if i not in alphabet:
                raise ValueError('tar value not in alphabet')

    if src == tar:
        if mat and (src, src) in mat:
            return mat[(src, src)]
        return match_cost
    if mat and (src, tar) in mat:
        return mat[(src, tar)]
    elif symmetric and mat and (tar, src) in mat:
        return mat[(tar, src)]
    return mismatch_cost


def needleman_wunsch(src, tar, gap_cost=1, sim_func=sim_ident):
    """Return the Needleman-Wunsch score of two strings.

    The Needleman-Wunsch score :cite:`Needleman:1970` is a standard edit
    distance measure.

    :param str src, tar: two strings to be compared
    :param float gap_cost: the cost of an alignment gap (1 by default)
    :param function sim_func: a function that returns the similarity of two
        characters (identity similarity by default)
    :returns: Needleman-Wunsch score
    :rtype: float (in fact dependent on the gap_cost & return value of
        sim_func)

    >>> needleman_wunsch('cat', 'hat')
    2.0
    >>> needleman_wunsch('Niall', 'Neil')
    1.0
    >>> needleman_wunsch('aluminum', 'Catalan')
    -1.0
    >>> needleman_wunsch('ATCG', 'TAGC')
    0.0
    """
    # pylint: disable=no-member
    d_mat = np_zeros((len(src)+1, len(tar)+1), dtype=np_float32)
    # pylint: enable=no-member

    for i in range(len(src)+1):
        d_mat[i, 0] = -(i * gap_cost)
    for j in range(len(tar)+1):
        d_mat[0, j] = -(j * gap_cost)
    for i in range(1, len(src)+1):
        for j in range(1, len(tar)+1):
            match = d_mat[i-1, j-1] + sim_func(src[i-1], tar[j-1])
            delete = d_mat[i-1, j] - gap_cost
            insert = d_mat[i, j-1] - gap_cost
            d_mat[i, j] = max(match, delete, insert)
    return d_mat[d_mat.shape[0]-1, d_mat.shape[1]-1]
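

# Added usage sketch (not part of the original module): Needleman-Wunsch with
# a substitution function built on sim_matrix; the matrix values here are
# invented purely for illustration.
def _example_nw_with_matrix():  # pragma: no cover
    """Score a DNA-style alignment with a toy substitution matrix.

    >>> toy_mat = {('G', 'C'): 0.5, ('A', 'T'): 0.5}
    >>> def toy_sim(src, tar):
    ...     return sim_matrix(src, tar, mat=toy_mat, mismatch_cost=-1)
    >>> needleman_wunsch('GATTACA', 'GATTAGA', sim_func=toy_sim)
    6.5
    """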


def smith_waterman(src, tar, gap_cost=1, sim_func=sim_ident):
    """Return the Smith-Waterman score of two strings.

    The Smith-Waterman score :cite:`Smith:1981` is a standard edit distance
    measure, differing from Needleman-Wunsch in that it focuses on local
    alignment and disallows negative scores.

    :param str src, tar: two strings to be compared
    :param float gap_cost: the cost of an alignment gap (1 by default)
    :param function sim_func: a function that returns the similarity of two
        characters (identity similarity by default)
    :returns: Smith-Waterman score
    :rtype: float (in fact dependent on the gap_cost & return value of
        sim_func)

    >>> smith_waterman('cat', 'hat')
    2.0
    >>> smith_waterman('Niall', 'Neil')
    1.0
    >>> smith_waterman('aluminum', 'Catalan')
    0.0
    >>> smith_waterman('ATCG', 'TAGC')
    1.0
    """
    # pylint: disable=no-member
    d_mat = np_zeros((len(src)+1, len(tar)+1), dtype=np_float32)
    # pylint: enable=no-member

    for i in range(len(src)+1):
        d_mat[i, 0] = 0
    for j in range(len(tar)+1):
        d_mat[0, j] = 0
    for i in range(1, len(src)+1):
        for j in range(1, len(tar)+1):
            match = d_mat[i-1, j-1] + sim_func(src[i-1], tar[j-1])
            delete = d_mat[i-1, j] - gap_cost
            insert = d_mat[i, j-1] - gap_cost
            d_mat[i, j] = max(0, match, delete, insert)
    return d_mat[d_mat.shape[0]-1, d_mat.shape[1]-1]


def gotoh(src, tar, gap_open=1, gap_ext=0.4, sim_func=sim_ident):
    """Return the Gotoh score of two strings.

    The Gotoh score :cite:`Gotoh:1982` is essentially Needleman-Wunsch with
    affine gap penalties.

    :param str src, tar: two strings to be compared
    :param float gap_open: the cost of an open alignment gap (1 by default)
    :param float gap_ext: the cost of an alignment gap extension (0.4 by
        default)
    :param function sim_func: a function that returns the similarity of two
        characters (identity similarity by default)
    :returns: Gotoh score
    :rtype: float (in fact dependent on the gap_cost & return value of
        sim_func)

    >>> gotoh('cat', 'hat')
    2.0
    >>> gotoh('Niall', 'Neil')
    1.0
    >>> round(gotoh('aluminum', 'Catalan'), 12)
    -0.4
    >>> gotoh('cat', 'hat')
    2.0
    """
    # pylint: disable=no-member
    d_mat = np_zeros((len(src)+1, len(tar)+1), dtype=np_float32)
    p_mat = np_zeros((len(src)+1, len(tar)+1), dtype=np_float32)
    q_mat = np_zeros((len(src)+1, len(tar)+1), dtype=np_float32)
    # pylint: enable=no-member

    d_mat[0, 0] = 0
    p_mat[0, 0] = float('-inf')
    q_mat[0, 0] = float('-inf')
    for i in range(1, len(src)+1):
        d_mat[i, 0] = float('-inf')
        p_mat[i, 0] = -gap_open - gap_ext*(i-1)
        q_mat[i, 0] = float('-inf')
        q_mat[i, 1] = -gap_open
    for j in range(1, len(tar)+1):
        d_mat[0, j] = float('-inf')
        p_mat[0, j] = float('-inf')
        p_mat[1, j] = -gap_open
        q_mat[0, j] = -gap_open - gap_ext*(j-1)

    for i in range(1, len(src)+1):
        for j in range(1, len(tar)+1):
            sim_val = sim_func(src[i-1], tar[j-1])
            d_mat[i, j] = max(d_mat[i-1, j-1] + sim_val,
                              p_mat[i-1, j-1] + sim_val,
                              q_mat[i-1, j-1] + sim_val)

            p_mat[i, j] = max(d_mat[i-1, j] - gap_open,
                              p_mat[i-1, j] - gap_ext)

            q_mat[i, j] = max(d_mat[i, j-1] - gap_open,
                              q_mat[i, j-1] - gap_ext)

    # unpack the final cell's indices from a generator over the matrix shape
    i, j = (n - 1 for n in d_mat.shape)
    return max(d_mat[i, j], p_mat[i, j], q_mat[i, j])
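

# Added illustration (not part of the original module): under Gotoh's affine
# model a gap of length ell costs gap_open + gap_ext*(ell - 1), so the
# two-character gap in 'ACGT' vs. 'AT' costs 1.4 rather than the 2.0 that
# Needleman-Wunsch charges at gap_cost=1.
def _example_affine_vs_linear_gaps():  # pragma: no cover
    """Compare affine and linear gap penalties on one long gap.

    >>> round(float(gotoh('ACGT', 'AT')), 1)
    0.6
    >>> float(needleman_wunsch('ACGT', 'AT'))
    0.0
    """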


def sim_length(src, tar):
    """Return the length similarity of two strings.

    Length similarity is the ratio of the length of the shorter string to the
    longer.

    :param str src, tar: two strings to be compared
    :returns: length similarity
    :rtype: float

    >>> sim_length('cat', 'hat')
    1.0
    >>> sim_length('Niall', 'Neil')
    0.8
    >>> sim_length('aluminum', 'Catalan')
    0.875
    >>> sim_length('ATCG', 'TAGC')
    1.0
    """
    if src == tar:
        return 1.0
    if not src or not tar:
        return 0.0
    return len(src)/len(tar) if len(src) < len(tar) else len(tar)/len(src)


def dist_length(src, tar):
    """Return the length distance between two strings.

    Length distance is the complement of length similarity:
    :math:`dist_{length} = 1 - sim_{length}`.

    :param str src, tar: two strings to be compared
    :returns: length distance
    :rtype: float

    >>> dist_length('cat', 'hat')
    0.0
    >>> dist_length('Niall', 'Neil')
    0.19999999999999996
    >>> dist_length('aluminum', 'Catalan')
    0.125
    >>> dist_length('ATCG', 'TAGC')
    0.0
    """
    return 1 - sim_length(src, tar)


def sim_prefix(src, tar):
    """Return the prefix similarity of two strings.

    Prefix similarity is the ratio of the length of the shorter term that
    exactly matches the longer term to the length of the shorter term,
    beginning at the start of both terms.

    :param str src, tar: two strings to be compared
    :returns: prefix similarity
    :rtype: float

    >>> sim_prefix('cat', 'hat')
    0.0
    >>> sim_prefix('Niall', 'Neil')
    0.25
    >>> sim_prefix('aluminum', 'Catalan')
    0.0
    >>> sim_prefix('ATCG', 'TAGC')
    0.0
    """
    if src == tar:
        return 1.0
    if not src or not tar:
        return 0.0
    min_word, max_word = (src, tar) if len(src) < len(tar) else (tar, src)
    min_len = len(min_word)
    for i in range(min_len, 0, -1):
        if min_word[:i] == max_word[:i]:
            return i/min_len
    return 0.0


def dist_prefix(src, tar):
    """Return the prefix distance between two strings.

    Prefix distance is the complement of prefix similarity:
    :math:`dist_{prefix} = 1 - sim_{prefix}`.

    :param str src, tar: two strings to be compared
    :returns: prefix distance
    :rtype: float

    >>> dist_prefix('cat', 'hat')
    1.0
    >>> dist_prefix('Niall', 'Neil')
    0.75
    >>> dist_prefix('aluminum', 'Catalan')
    1.0
    >>> dist_prefix('ATCG', 'TAGC')
    1.0
    """
    return 1 - sim_prefix(src, tar)


def sim_suffix(src, tar):
    """Return the suffix similarity of two strings.

    Suffix similarity is the ratio of the length of the shorter term that
    exactly matches the longer term to the length of the shorter term,
    beginning at the end of both terms.

    :param str src, tar: two strings to be compared
    :returns: suffix similarity
    :rtype: float

    >>> sim_suffix('cat', 'hat')
    0.6666666666666666
    >>> sim_suffix('Niall', 'Neil')
    0.25
    >>> sim_suffix('aluminum', 'Catalan')
    0.0
    >>> sim_suffix('ATCG', 'TAGC')
    0.0
    """
    if src == tar:
        return 1.0
    if not src or not tar:
        return 0.0
    min_word, max_word = (src, tar) if len(src) < len(tar) else (tar, src)
    min_len = len(min_word)
    for i in range(min_len, 0, -1):
        if min_word[-i:] == max_word[-i:]:
            return i/min_len
    return 0.0


def dist_suffix(src, tar):
    """Return the suffix distance between two strings.

    Suffix distance is the complement of suffix similarity:
    :math:`dist_{suffix} = 1 - sim_{suffix}`.

    :param str src, tar: two strings to be compared
    :returns: suffix distance
    :rtype: float

    >>> dist_suffix('cat', 'hat')
    0.33333333333333337
    >>> dist_suffix('Niall', 'Neil')
    0.75
    >>> dist_suffix('aluminum', 'Catalan')
    1.0
    >>> dist_suffix('ATCG', 'TAGC')
    1.0
    """
    return 1 - sim_suffix(src, tar)
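

# Added illustration (not part of the original module): prefix and suffix
# similarity anchor the comparison at opposite ends of the terms.
def _example_prefix_vs_suffix():  # pragma: no cover
    """Contrast anchoring at the start with anchoring at the end.

    >>> sim_prefix('testing', 'test')
    1.0
    >>> sim_suffix('testing', 'test')
    0.0
    >>> sim_suffix('testing', 'ing')
    1.0
    """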


def sim_mlipns(src, tar, threshold=0.25, maxmismatches=2):
    """Return the MLIPNS similarity of two strings.

    Modified Language-Independent Product Name Search (MLIPNS) is described in
    :cite:`Shannaq:2010`. This function returns only 1.0 (similar) or 0.0
    (not similar). LIPNS similarity is identical to normalized Hamming
    similarity.

    :param str src, tar: two strings to be compared
    :param float threshold: a number [0, 1] indicating the maximum similarity
        score, below which the strings are considered 'similar' (0.25 by
        default)
    :param int maxmismatches: a number indicating the allowable number of
        mismatches to remove before declaring two strings not similar (2 by
        default)
    :returns: MLIPNS similarity
    :rtype: float

    >>> sim_mlipns('cat', 'hat')
    1.0
    >>> sim_mlipns('Niall', 'Neil')
    0.0
    >>> sim_mlipns('aluminum', 'Catalan')
    0.0
    >>> sim_mlipns('ATCG', 'TAGC')
    0.0
    """
    if tar == src:
        return 1.0
    if not src or not tar:
        return 0.0

    mismatches = 0
    ham = hamming(src, tar, difflens=True)
    maxlen = max(len(src), len(tar))
    while src and tar and mismatches <= maxmismatches:
        if maxlen < 1 or (1-(maxlen-ham)/maxlen) <= threshold:
            return 1.0
        else:
            mismatches += 1
            ham -= 1
            maxlen -= 1

    if maxlen < 1:
        return 1.0
    return 0.0


def dist_mlipns(src, tar, threshold=0.25, maxmismatches=2):
    """Return the MLIPNS distance between two strings.

    MLIPNS distance is the complement of MLIPNS similarity:
    :math:`dist_{MLIPNS} = 1 - sim_{MLIPNS}`. This function returns only 0.0
    (not distant) or 1.0 (distant).

    :param str src, tar: two strings to be compared
    :param float threshold: a number [0, 1] indicating the maximum similarity
        score, below which the strings are considered 'similar' (0.25 by
        default)
    :param int maxmismatches: a number indicating the allowable number of
        mismatches to remove before declaring two strings not similar (2 by
        default)
    :returns: MLIPNS distance
    :rtype: float

    >>> dist_mlipns('cat', 'hat')
    0.0
    >>> dist_mlipns('Niall', 'Neil')
    1.0
    >>> dist_mlipns('aluminum', 'Catalan')
    1.0
    >>> dist_mlipns('ATCG', 'TAGC')
    1.0
    """
    return 1.0 - sim_mlipns(src, tar, threshold, maxmismatches)


def bag(src, tar):
    """Return the bag distance between two strings.

    Bag distance is proposed in :cite:`Bartolini:2002`. It is defined as:
    :math:`max(|multiset(src)-multiset(tar)|, |multiset(tar)-multiset(src)|)`.

    :param str src, tar: two strings to be compared
    :returns: bag distance
    :rtype: int

    >>> bag('cat', 'hat')
    1
    >>> bag('Niall', 'Neil')
    2
    >>> bag('aluminum', 'Catalan')
    5
    >>> bag('ATCG', 'TAGC')
    0
    >>> bag('abcdefg', 'hijklm')
    7
    >>> bag('abcdefg', 'hijklmno')
    8
    """
    if tar == src:
        return 0
    elif not src:
        return len(tar)
    elif not tar:
        return len(src)

    src_bag = Counter(src)
    tar_bag = Counter(tar)
    return max(sum((src_bag-tar_bag).values()),
               sum((tar_bag-src_bag).values()))
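

# Added sketch (not part of the original module): the two one-sided multiset
# differences behind bag(), spelled out with collections.Counter (imported at
# the top of this module) for 'Niall' vs. 'Neil'.
def _example_bag_multisets():  # pragma: no cover
    """Show both one-sided differences and their maximum.

    >>> src_bag, tar_bag = Counter('Niall'), Counter('Neil')
    >>> sorted((src_bag - tar_bag).elements())
    ['a', 'l']
    >>> sorted((tar_bag - src_bag).elements())
    ['e']
    >>> max(sum((src_bag - tar_bag).values()),
    ...     sum((tar_bag - src_bag).values()))
    2
    """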


def dist_bag(src, tar):
    """Return the normalized bag distance between two strings.

    Bag distance is normalized by dividing by :math:`max( |src|, |tar| )`.

    :param str src, tar: two strings to be compared
    :returns: normalized bag distance
    :rtype: float

    >>> dist_bag('cat', 'hat')
    0.3333333333333333
    >>> dist_bag('Niall', 'Neil')
    0.4
    >>> dist_bag('aluminum', 'Catalan')
    0.625
    >>> dist_bag('ATCG', 'TAGC')
    0.0
    """
    if tar == src:
        return 0.0
    if not src or not tar:
        return 1.0

    maxlen = max(len(src), len(tar))

    return bag(src, tar)/maxlen


def sim_bag(src, tar):
    """Return the normalized bag similarity of two strings.

    Normalized bag similarity is the complement of normalized bag distance:
    :math:`sim_{bag} = 1 - dist_{bag}`.

    :param str src, tar: two strings to be compared
    :returns: normalized bag similarity
    :rtype: float

    >>> round(sim_bag('cat', 'hat'), 12)
    0.666666666667
    >>> sim_bag('Niall', 'Neil')
    0.6
    >>> sim_bag('aluminum', 'Catalan')
    0.375
    >>> sim_bag('ATCG', 'TAGC')
    1.0
    """
    return 1-dist_bag(src, tar)


def editex(src, tar, cost=(0, 1, 2), local=False):
    """Return the Editex distance between two strings.

    As described on pages 3 & 4 of :cite:`Zobel:1996`.

    The local variant is based on :cite:`Ring:2009`.

    :param str src, tar: two strings to be compared
    :param tuple cost: a 3-tuple representing the cost of the three possible
        edit operations: match, same-group, and mismatch respectively (by
        default: (0, 1, 2))
    :param bool local: if True, the local variant of Editex is used
    :returns: Editex distance
    :rtype: int

    >>> editex('cat', 'hat')
    2
    >>> editex('Niall', 'Neil')
    2
    >>> editex('aluminum', 'Catalan')
    12
    >>> editex('ATCG', 'TAGC')
    6
    """
    match_cost, group_cost, mismatch_cost = cost
    letter_groups = ({'A', 'E', 'I', 'O', 'U', 'Y'},
                     {'B', 'P'},
                     {'C', 'K', 'Q'},
                     {'D', 'T'},
                     {'L', 'R'},
                     {'M', 'N'},
                     {'G', 'J'},
                     {'F', 'P', 'V'},
                     {'S', 'X', 'Z'},
                     {'C', 'S', 'Z'})
    all_letters = {'A', 'B', 'C', 'D', 'E', 'F', 'G', 'I', 'J', 'K', 'L', 'M',
                   'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'X', 'Y', 'Z'}

    def r_cost(ch1, ch2):
        """Return r(a,b) according to Zobel & Dart's definition."""
        if ch1 == ch2:
            return match_cost
        if ch1 in all_letters and ch2 in all_letters:
            for group in letter_groups:
                if ch1 in group and ch2 in group:
                    return group_cost
        return mismatch_cost

    def d_cost(ch1, ch2):
        """Return d(a,b) according to Zobel & Dart's definition."""
        if ch1 != ch2 and (ch1 == 'H' or ch1 == 'W'):
            return group_cost
        return r_cost(ch1, ch2)

    # convert both src & tar to NFKD normalized unicode
    src = normalize('NFKD', text_type(src.upper()))
    tar = normalize('NFKD', text_type(tar.upper()))
    # convert ß to SS (for Python2)
    src = src.replace('ß', 'SS')
    tar = tar.replace('ß', 'SS')

    if src == tar:
        return 0
    if not src:
        return len(tar) * mismatch_cost
    if not tar:
        return len(src) * mismatch_cost

    # pylint: disable=no-member
    d_mat = np_zeros((len(src)+1, len(tar)+1), dtype=np_int)
    # pylint: enable=no-member
    lens = len(src)
    lent = len(tar)
    src = ' '+src
    tar = ' '+tar

    if not local:
        for i in range(1, lens+1):
            d_mat[i, 0] = d_mat[i-1, 0] + d_cost(src[i-1], src[i])
    for j in range(1, lent+1):
        d_mat[0, j] = d_mat[0, j-1] + d_cost(tar[j-1], tar[j])

    for i in range(1, lens+1):
        for j in range(1, lent+1):
            d_mat[i, j] = min(d_mat[i-1, j] + d_cost(src[i-1], src[i]),
                              d_mat[i, j-1] + d_cost(tar[j-1], tar[j]),
                              d_mat[i-1, j-1] + r_cost(src[i], tar[j]))

    return d_mat[lens, lent]
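

# Added illustration (not part of the original module): Editex charges the
# cheaper same-group cost when two letters share a phonetic group, as 'D'
# and 'T' do, but the full mismatch cost across groups, as with 'D' and 'R'.
def _example_editex_groups():  # pragma: no cover
    """Contrast a same-group substitution with a cross-group one.

    >>> editex('dent', 'tent')
    1
    >>> editex('dent', 'rent')
    2
    """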


def dist_editex(src, tar, cost=(0, 1, 2), local=False):
    """Return the normalized Editex distance between two strings.

    The Editex distance is normalized by dividing the Editex distance
    (calculated by any of the three supported methods) by the greater of
    the number of characters in src times the cost of a delete and
    the number of characters in tar times the cost of an insert.
    For the case in which all operations have :math:`cost = 1`, this is
    equivalent to the greater of the length of the two strings src & tar.

    :param str src, tar: two strings to be compared
    :param tuple cost: a 3-tuple representing the cost of the three possible
        edit operations: match, same-group, and mismatch respectively (by
        default: (0, 1, 2))
    :param bool local: if True, the local variant of Editex is used
    :returns: normalized Editex distance
    :rtype: float

    >>> round(dist_editex('cat', 'hat'), 12)
    0.333333333333
    >>> round(dist_editex('Niall', 'Neil'), 12)
    0.2
    >>> dist_editex('aluminum', 'Catalan')
    0.75
    >>> dist_editex('ATCG', 'TAGC')
    0.75
    """
    if src == tar:
        return 0.0
    mismatch_cost = cost[2]
    return (editex(src, tar, cost, local) /
            (max(len(src)*mismatch_cost, len(tar)*mismatch_cost)))


def sim_editex(src, tar, cost=(0, 1, 2), local=False):
    """Return the normalized Editex similarity of two strings.

    The Editex similarity is the complement of Editex distance:
    :math:`sim_{Editex} = 1 - dist_{Editex}`.

    :param str src, tar: two strings to be compared
    :param tuple cost: a 3-tuple representing the cost of the three possible
        edit operations: match, same-group, and mismatch respectively (by
        default: (0, 1, 2))
    :param bool local: if True, the local variant of Editex is used
    :returns: normalized Editex similarity
    :rtype: float

    >>> round(sim_editex('cat', 'hat'), 12)
    0.666666666667
    >>> round(sim_editex('Niall', 'Neil'), 12)
    0.8
    >>> sim_editex('aluminum', 'Catalan')
    0.25
    >>> sim_editex('ATCG', 'TAGC')
    0.25
    """
    return 1 - dist_editex(src, tar, cost, local)


def eudex_hamming(src, tar, weights='exponential', maxlength=8,
                  normalized=False):
    """Calculate the Hamming distance between the Eudex hashes of two terms.

    Cf. :cite:`Ticki:2016`.

    - If weights is set to None, a simple Hamming distance is calculated.
    - If weights is set to 'exponential', weight decays by powers of 2, as
      proposed in the eudex specification: https://github.com/ticki/eudex.
    - If weights is set to 'fibonacci', weight decays through the Fibonacci
      series, as in the eudex reference implementation.
    - If weights is set to a callable function, this assumes it creates a
      generator and the generator is used to populate a series of weights.
    - If weights is set to an iterable, the iterable's values should be
      integers and will be used as the weights.

    :param str src, tar: two strings to be compared
    :param iterable or generator function weights: the weights to apply to
        each byte of the XORed hashes, as described above
    :param maxlength: the number of characters to encode as a eudex hash
    :param bool normalized: normalize the result to [0, 1] if True
    :returns: the (weighted) Hamming distance between the terms' Eudex hashes
    """
    def _gen_fibonacci():
        """Yield the next Fibonacci number.

        Based on https://www.python-course.eu/generators.php
        Starts at Fibonacci number 3 (the second 1)
        """
        num_a, num_b = 1, 2
        while True:
            yield num_a
            num_a, num_b = num_b, num_a + num_b

    def _gen_exponential(base=2):
        """Yield the next value in an exponential series of the base.

        Starts at base**0
        """
        exp = 0
        while True:
            yield base ** exp
            exp += 1

    # Calculate the eudex hashes and XOR them
    xored = eudex(src, maxlength=maxlength) ^ eudex(tar, maxlength=maxlength)

    # Simple hamming distance (all bits are equal)
    if not weights:
        return bin(xored).count('1')

    # If weights is a function, it should create a generator,
    # which we now use to populate a list
    if callable(weights):
        weights = weights()
    elif weights == 'exponential':
        weights = _gen_exponential()
    elif weights == 'fibonacci':
        weights = _gen_fibonacci()
    if isinstance(weights, GeneratorType):
        weights = [next(weights) for _ in range(maxlength)][::-1]

    # Sum the weighted hamming distance
    dist = 0
    maxdist = 0
    while (xored or normalized) and weights:
        maxdist += 8*weights[-1]
        dist += bin(xored & 0xFF).count('1') * weights.pop()
        xored >>= 8

    if normalized:
        dist /= maxdist

    return dist
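

# Added sketch (not part of the original module): the default exponential
# weights, materialized the same way eudex_hamming consumes them -- built in
# descending order and popped from the end, so the lowest-order byte of the
# XORed hashes carries weight 1.
def _example_eudex_weights():  # pragma: no cover
    """List the first eight exponential weights in pop order.

    >>> def gen_exponential(base=2):
    ...     exp = 0
    ...     while True:
    ...         yield base ** exp
    ...         exp += 1
    >>> weights = gen_exponential()
    >>> [next(weights) for _ in range(8)][::-1]
    [128, 64, 32, 16, 8, 4, 2, 1]
    """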


def dist_eudex(src, tar, weights='exponential', maxlength=8):
    """Return normalized Hamming distance between Eudex hashes of two terms.

    This is Eudex distance normalized to [0, 1].

    :param str src, tar: two strings to be compared
    :param iterable or generator function weights: the weights to apply to
        each byte of the XORed hashes (see eudex_hamming)
    :param maxlength: the number of characters to encode as a eudex hash
    :returns: normalized Eudex Hamming distance
    :rtype: float
    """
    return eudex_hamming(src, tar, weights, maxlength, True)


def sim_eudex(src, tar, weights='exponential', maxlength=8):
    """Return normalized Hamming similarity between Eudex hashes of two terms.

    Normalized Eudex similarity is the complement of normalized Eudex
    distance:
    :math:`sim_{Eudex} = 1 - dist_{Eudex}`.

    :param str src, tar: two strings to be compared
    :param iterable or generator function weights: the weights to apply to
        each byte of the XORed hashes (see eudex_hamming)
    :param maxlength: the number of characters to encode as a eudex hash
    :returns: normalized Eudex Hamming similarity
    :rtype: float
    """
    return 1-dist_eudex(src, tar, weights, maxlength)


def sift4_simplest(src, tar, max_offset=0):
    """Return the "simplest" Sift4 distance between two terms.

    This is an approximation of edit distance, described in
    :cite:`Zackwehdex:2014`.

    :param str src, tar: two strings to be compared
    :param max_offset: the number of characters to search for matching letters
    :return: the Sift4 distance according to the simplest formula
    :rtype: int
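
    A rough sketch of usage (with the default max_offset=0, no off-position
    matches are sought, so only in-position matches count)::

        sift4_simplest('cat', 'hat')
        sift4_simplest('Niall', 'Neil', max_offset=2)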
    """
    if not src:
        return len(tar)

    if not tar:
        return len(src)

    src_len = len(src)
    tar_len = len(tar)

    src_cur = 0
    tar_cur = 0
    lcss = 0
    local_cs = 0

    while (src_cur < src_len) and (tar_cur < tar_len):
        if src[src_cur] == tar[tar_cur]:
            local_cs += 1
        else:
            lcss += local_cs
            local_cs = 0
            if src_cur != tar_cur:
                src_cur = tar_cur = max(src_cur, tar_cur)
            for i in range(max_offset):
                if not ((src_cur+i < src_len) or (tar_cur+i < tar_len)):
                    break
                if (src_cur+i < src_len) and (src[src_cur+i] == tar[tar_cur]):
                    src_cur += i
                    local_cs += 1
                    break
                if (tar_cur+i < tar_len) and (src[src_cur] == tar[tar_cur+i]):
                    tar_cur += i
                    local_cs += 1
                    break

        src_cur += 1
        tar_cur += 1

    lcss += local_cs
    return round(max(src_len, tar_len) - lcss)


def sift4_common(src, tar, max_offset=0, max_distance=0):
    """Return the "common" Sift4 distance between two terms.

    This is an approximation of edit distance, described in
    :cite:`Zackwehdex:2014`.

    :param str src, tar: two strings to be compared
    :param max_offset: the number of characters to search for matching letters
    :param max_distance: the distance at which to stop and exit
    :return: the Sift4 distance according to the common formula
    :rtype: int
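
    A rough sketch of usage (a non-zero max_distance provides an early exit
    as soon as the running distance reaches it)::

        sift4_common('Niall', 'Neil', max_offset=2)
        sift4_common('Niall', 'Neil', max_offset=2, max_distance=3)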
    """
    if not src:
        return len(tar)

    if not tar:
        return len(src)

    src_len = len(src)
    tar_len = len(tar)

    src_cur = 0
    tar_cur = 0
    lcss = 0
    local_cs = 0
    trans = 0
    offset_arr = []

    while (src_cur < src_len) and (tar_cur < tar_len):
        if src[src_cur] == tar[tar_cur]:
            local_cs += 1
            is_trans = False
            i = 0
            while i < len(offset_arr):
                ofs = offset_arr[i]
                if src_cur <= ofs['src_cur'] or tar_cur <= ofs['tar_cur']:
                    is_trans = (abs(tar_cur-src_cur) >=
                                abs(ofs['tar_cur']-ofs['src_cur']))
                    if is_trans:
                        trans += 1
                    elif not ofs['trans']:
                        ofs['trans'] = True
                        trans += 1
                    break
                elif src_cur > ofs['tar_cur'] and tar_cur > ofs['src_cur']:
                    del offset_arr[i]
                else:
                    i += 1

            offset_arr.append({'src_cur': src_cur, 'tar_cur': tar_cur,
                               'trans': is_trans})
        else:
            lcss += local_cs
            local_cs = 0
            if src_cur != tar_cur:
                src_cur = tar_cur = min(src_cur, tar_cur)
            for i in range(max_offset):
                if not ((src_cur+i < src_len) or (tar_cur+i < tar_len)):
                    break
                if (src_cur+i < src_len) and (src[src_cur+i] == tar[tar_cur]):
                    src_cur += i-1
                    tar_cur -= 1
                    break
                if (tar_cur+i < tar_len) and (src[src_cur] == tar[tar_cur+i]):
                    src_cur -= 1
                    tar_cur += i-1
                    break

        src_cur += 1
        tar_cur += 1

        if max_distance:
            temporary_distance = max(src_cur, tar_cur) - lcss + trans
            if temporary_distance >= max_distance:
                return round(temporary_distance)

        if (src_cur >= src_len) or (tar_cur >= tar_len):
            lcss += local_cs
            local_cs = 0
            src_cur = tar_cur = min(src_cur, tar_cur)

    lcss += local_cs
    return round(max(src_len, tar_len) - lcss + trans)


def dist_sift4(src, tar, max_offset=0, max_distance=0):
    """Return the normalized "common" Sift4 distance between two terms.

    This is Sift4 distance, normalized to [0, 1].

    :param str src, tar: two strings to be compared
    :param max_offset: the number of characters to search for matching letters
    :param max_distance: the distance at which to stop and exit
    :return: the normalized Sift4 distance
    :rtype: float
    """
    return (sift4_common(src, tar, max_offset, max_distance) /
            max(1, len(src), len(tar)))  # max(1, ...) avoids division by zero


def sim_sift4(src, tar, max_offset=0, max_distance=0):
    """Return the normalized "common" Sift4 similarity of two terms.

    Normalized Sift4 similarity is the complement of normalized Sift4
    distance: :math:`sim_{Sift4} = 1 - dist_{Sift4}`.

    :param str src, tar: two strings to be compared
    :param max_offset: the number of characters to search for matching letters
    :param max_distance: the distance at which to stop and exit
    :return: the normalized Sift4 similarity
    :rtype: float
    """
    return 1-dist_sift4(src, tar, max_offset, max_distance)


def sim_baystat(src, tar, min_ss_len=None, left_ext=None, right_ext=None):
    """Return the Baystat similarity.

    Good results for shorter words are reported when setting min_ss_len to 1
    and either left_ext OR right_ext to 1.

    The Baystat similarity is defined in :cite:`Furnohr:2002`.

    This is ostensibly a port of the R module PPRL's implementation:
    https://github.com/cran/PPRL/blob/master/src/MTB_Baystat.cpp
    :cite:`Rukasz:2018`. As such, this could be made more pythonic.

    :param str src, tar: two strings to be compared
    :param int min_ss_len: minimum substring length to be considered
    :param int left_ext: left-side extension length
    :param int right_ext: right-side extension length
    :rtype: float
    :return: the Baystat similarity
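
    A rough sketch of usage (with the extension arguments left at None,
    defaults are chosen from the length of the longer string, based on the
    article's values)::

        sim_baystat('Niall', 'Neil')
        sim_baystat('Niall', 'Neil', min_ss_len=1, left_ext=1, right_ext=0)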
    """
    if src == tar:
        return 1
    if not src or not tar:
        return 0

    max_len = max(len(src), len(tar))

    if not (min_ss_len and left_ext and right_ext):
        # These can be set via arguments to the function. Otherwise they are
        # set automatically based on values from the article.
        if max_len >= 7:
            min_ss_len = 2
            left_ext = 2
            right_ext = 2
        else:
            # The paper suggests that for short names, (exclusively) one or
            # the other of left_ext and right_ext can be 1, with good
            # results. I use 0 & 0 as the default in this case.
            min_ss_len = 1
            left_ext = 0
            right_ext = 0

    pos = 0
    match_len = 0

    while True:
        if pos + min_ss_len > len(src):
            return match_len/max_len

        hit_len = 0
        ix = 1

        substring = src[pos:pos + min_ss_len]
        search_begin = pos - left_ext

        if search_begin < 0:
            search_begin = 0
            left_ext_len = pos
        else:
            left_ext_len = left_ext

        if pos + min_ss_len + right_ext >= len(tar):
            right_ext_len = len(tar) - pos - min_ss_len
        else:
            right_ext_len = right_ext

        if (search_begin + left_ext_len + min_ss_len + right_ext_len >
                search_begin):
            search_val = tar[search_begin:(search_begin + left_ext_len +
                                           min_ss_len + right_ext_len)]
        else:
            search_val = ''

        flagged_tar = ''
        while substring in search_val and pos + ix <= len(src):
            hit_len = len(substring)
            flagged_tar = tar.replace(substring, '#'*hit_len)

            if pos + min_ss_len + ix <= len(src):
                substring = src[pos:pos + min_ss_len + ix]

            if pos + min_ss_len + right_ext_len + 1 <= len(tar):
                right_ext_len += 1

            if (search_begin + left_ext_len + min_ss_len + right_ext_len <=
                    len(tar)):
                search_val = tar[search_begin:(search_begin + left_ext_len +
                                               min_ss_len + right_ext_len)]

            ix += 1

        if hit_len > 0:
            tar = flagged_tar

        match_len += hit_len
        pos += ix


def dist_baystat(src, tar, min_ss_len=None, left_ext=None, right_ext=None):
    """Return the Baystat distance.

    Normalized Baystat similarity is the complement of normalized Baystat
    distance: :math:`sim_{Baystat} = 1 - dist_{Baystat}`.

    :param str src, tar: two strings to be compared
    :param int min_ss_len: minimum substring length to be considered
    :param int left_ext: left-side extension length
    :param int right_ext: right-side extension length
    :rtype: float
    :return: the Baystat distance
    """
    return 1-sim_baystat(src, tar, min_ss_len, left_ext, right_ext)


def typo(src, tar, metric='euclidean', cost=(1, 1, 0.5, 0.5)):
    """Return the typo distance between two strings.

    This is inspired by Typo-Distance :cite:`Song:2011`, and a fair bit of
    this was copied from that module. Compared to the original, this supports
    different metrics for substitution.

    :param str src, tar: two strings to be compared
    :param str metric: supported values include: 'euclidean', 'manhattan',
        'log-euclidean', and 'log-manhattan'
    :param tuple cost: a 4-tuple representing the cost of the four possible
        edits: inserts, deletes, substitutions, and shift, respectively (by
        default: (1, 1, 0.5, 0.5)). The substitution & shift costs should be
        significantly less than the cost of an insertion & deletion unless
        a log metric is used.
    :return: typo distance
    :rtype: float
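
    A rough sketch of usage (mistyping a physically adjacent key costs less
    than hitting a distant one)::

        typo('typo', 'rypo')  # 'r' neighbors 't' on QWERTY, so low cost
        typo('typo', 'zypo')  # 'z' is farther from 't', so higher cost
        typo('typo', 'rypo', metric='log-manhattan')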
    """
    ins_cost, del_cost, sub_cost, shift_cost = cost

    if src == tar:
        return 0.0
    if not src:
        return len(tar) * ins_cost
    if not tar:
        return len(src) * del_cost

    layout = {'QWERTY': (
        (('`', '1', '2', '3', '4', '5', '6', '7', '8', '9', '0', '-', '='),
         ('', 'q', 'w', 'e', 'r', 't', 'y', 'u', 'i', 'o', 'p', '[', ']',
          '\\'),
         ('', 'a', 's', 'd', 'f', 'g', 'h', 'j', 'k', 'l', ';', '\''),
         ('', 'z', 'x', 'c', 'v', 'b', 'n', 'm', ',', '.', '/'),
         ('', '', ' ', ' ', ' ', ' ', ' ', ' ', ' ')),
        (('~', '!', '@', '#', '$', '%', '^', '&', '*', '(', ')', '_', '+'),
         ('', 'Q', 'W', 'E', 'R', 'T', 'Y', 'U', 'I', 'O', 'P', '{', '}', '|'),
         ('', 'A', 'S', 'D', 'F', 'G', 'H', 'J', 'K', 'L', ':', '"'),
         ('', 'Z', 'X', 'C', 'V', 'B', 'N', 'M', '<', '>', '?'),
         ('', '', ' ', ' ', ' ', ' ', ' ', ' ', ' '))
    )}

    keyboard = layout['QWERTY']
    lowercase = {item for sublist in keyboard[0] for item in sublist}
    uppercase = {item for sublist in keyboard[1] for item in sublist}

    def _kb_array_for_char(char):
        """Return the keyboard layout that contains char."""
        if char in lowercase:
            return keyboard[0]
        elif char in uppercase:
            return keyboard[1]
        else:
            raise ValueError(char + ' not found in any keyboard layouts')

    def _get_char_coord(char, keyboard):
        """Return the row & column of char in the keyboard."""
        for row in keyboard:
            if char in row:
                return keyboard.index(row), row.index(char)
        raise ValueError(char + ' not found in given keyboard layout')

    def _euclidean_keyboard_distance(char1, char2):
        row1, col1 = _get_char_coord(char1, _kb_array_for_char(char1))
        row2, col2 = _get_char_coord(char2, _kb_array_for_char(char2))
        return ((row1 - row2) ** 2 + (col1 - col2) ** 2) ** 0.5

    def _manhattan_keyboard_distance(char1, char2):
        row1, col1 = _get_char_coord(char1, _kb_array_for_char(char1))
        row2, col2 = _get_char_coord(char2, _kb_array_for_char(char2))
        return abs(row1 - row2) + abs(col1 - col2)

    def _log_euclidean_keyboard_distance(char1, char2):
        return log(1 + _euclidean_keyboard_distance(char1, char2))

    def _log_manhattan_keyboard_distance(char1, char2):
        return log(1 + _manhattan_keyboard_distance(char1, char2))

    metric_dict = {'euclidean': _euclidean_keyboard_distance,
                   'manhattan': _manhattan_keyboard_distance,
                   'log-euclidean': _log_euclidean_keyboard_distance,
                   'log-manhattan': _log_manhattan_keyboard_distance}

    def _substitution_cost(char1, char2):
        cost = sub_cost
        cost *= (metric_dict[metric](char1, char2) +
                 shift_cost * (_kb_array_for_char(char1) !=
                               _kb_array_for_char(char2)))
        return cost

    d_mat = np_zeros((len(src) + 1, len(tar) + 1), dtype=np_float32)
    for i in range(len(src) + 1):
        d_mat[i, 0] = i * del_cost
    for j in range(len(tar) + 1):
        d_mat[0, j] = j * ins_cost

    for i in range(len(src)):
        for j in range(len(tar)):
            d_mat[i + 1, j + 1] = min(
                d_mat[i + 1, j] + ins_cost,  # ins
                d_mat[i, j + 1] + del_cost,  # del
                d_mat[i, j] + (_substitution_cost(src[i], tar[j])
                               if src[i] != tar[j] else 0)  # sub/==
            )

    return d_mat[len(src), len(tar)]


def dist_typo(src, tar, metric='euclidean', cost=(1, 1, 0.5, 0.5)):
    """Return the normalized typo distance between two strings.

    This is typo distance, normalized to [0, 1].

    :param str src, tar: two strings to be compared
    :param str metric: supported values include: 'euclidean', 'manhattan',
        'log-euclidean', and 'log-manhattan'
    :param tuple cost: a 4-tuple representing the cost of the four possible
        edits: inserts, deletes, substitutions, and shift, respectively (by
        default: (1, 1, 0.5, 0.5)). The substitution & shift costs should be
        significantly less than the cost of an insertion & deletion unless
        a log metric is used.
    :return: normalized typo distance
    :rtype: float
    """
    if src == tar:
        return 0
    ins_cost, del_cost = cost[:2]
    return (typo(src, tar, metric, cost) /
            (max(len(src)*del_cost, len(tar)*ins_cost)))


def sim_typo(src, tar, metric='euclidean', cost=(1, 1, 0.5, 0.5)):
    """Return the normalized typo similarity between two strings.

    Normalized typo similarity is the complement of normalized typo distance:
    :math:`sim_{typo} = 1 - dist_{typo}`.

    :param str src, tar: two strings to be compared
    :param str metric: supported values include: 'euclidean', 'manhattan',
        'log-euclidean', and 'log-manhattan'
    :param tuple cost: a 4-tuple representing the cost of the four possible
        edits: inserts, deletes, substitutions, and shift, respectively (by
        default: (1, 1, 0.5, 0.5)). The substitution & shift costs should be
        significantly less than the cost of an insertion & deletion unless
        a log metric is used.
    :return: normalized typo similarity
    :rtype: float
    """
    return 1 - dist_typo(src, tar, metric, cost)


def dist_indel(src, tar):
    """Return the indel distance between two strings.

    This is equivalent to Levenshtein distance, when only inserts and deletes
    are possible.

    :param str src, tar: two strings to be compared
    :return: indel distance
    :rtype: float
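
    A rough sketch of usage (the 9999 substitution & transposition costs
    below effectively restrict the alignment to insertions and deletions)::

        dist_indel('cat', 'hat')  # 'c' must be deleted and 'h' inserted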
    """
    return dist_levenshtein(src, tar, mode='lev', cost=(1, 1, 9999, 9999))


def sim_indel(src, tar):
    """Return the indel similarity of two strings.

    Normalized indel similarity is the complement of normalized indel
    distance: :math:`sim_{indel} = 1 - dist_{indel}`

    :param str src, tar: two strings to be compared
    :return: indel similarity
    :rtype: float
    """
    return sim_levenshtein(src, tar, mode='lev', cost=(1, 1, 9999, 9999))


def _synoname_strip_punct(word):
    """Return a word with punctuation stripped out.

    :param str word: a word to strip punctuation from
    :return: the word, stripped of punctuation
    :rtype: str
    """
    stripped = ''
    for char in word:
        if char not in set(',-./:;"&\'()!{|}?$%*+<=>[\\]^_`~'):
            stripped += char
    return stripped.strip()


def synoname_word_approximation(src_ln, tar_ln, src_fn='', tar_fn='',
                                features=None):
    """Return the Synoname word approximation score for two names.

    :param str src_ln, tar_ln: last names of the source and target
    :param str src_fn, tar_fn: first names of the source and target
    :param dict features: a dict of features (optional), which may include
        'gen_conflict', 'roman_conflict', 'src_specials', and 'tar_specials'
    :return: the word approximation score
    :rtype: float
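
    A rough sketch of usage (the features dict is ordinarily assembled by
    synoname() from the names' toolcodes, but it may be omitted)::

        synoname_word_approximation('smith wilson', 'smith', 'john', 'john')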
    """
    if features is None:
        features = {}
    if 'src_specials' not in features:
        features['src_specials'] = []
    if 'tar_specials' not in features:
        features['tar_specials'] = []

    src_len_specials = len(features['src_specials'])
    tar_len_specials = len(features['tar_specials'])

    # 1
    if features.get('gen_conflict') or features.get('roman_conflict'):
        return 0

    # 3 & 7
    full_tar1 = ' '.join((tar_ln, tar_fn)).replace('-', ' ')
    for s_type, s_pos in features['tar_specials']:
        spec = _synoname_special_table[s_type][1]
        if s_pos == 'a':
            full_tar1 = full_tar1[:-(1+len(spec))]
        elif s_pos == 'b':
            loc = full_tar1.find(' '+spec+' ')+1
            full_tar1 = full_tar1[:loc]+full_tar1[loc+len(spec):]
        elif s_pos == 'c':
            full_tar1 = full_tar1[1+len(spec):]

    full_src1 = ' '.join((src_ln, src_fn)).replace('-', ' ')
    for s_type, s_pos in features['src_specials']:
        spec = _synoname_special_table[s_type][1]
        if s_pos == 'a':
            full_src1 = full_src1[:-(1+len(spec))]
        elif s_pos == 'b':
            loc = full_src1.find(' '+spec+' ')+1
            full_src1 = full_src1[:loc]+full_src1[loc+len(spec):]
        elif s_pos == 'c':
            full_src1 = full_src1[1+len(spec):]

    full_tar2 = full_tar1
    for s_type, s_pos in features['tar_specials']:
        spec = _synoname_special_table[s_type][1]
        if s_pos == 'd':
            full_tar2 = full_tar2[len(spec):]
        elif s_pos == 'X' and spec in full_tar2:
            loc = full_tar2.find(' '+spec+' ')+1
            full_tar2 = full_tar2[:loc]+full_tar2[loc+len(spec):]

    full_src2 = full_src1
    for s_type, s_pos in features['src_specials']:
        spec = _synoname_special_table[s_type][1]
        if s_pos == 'd':
            full_src2 = full_src2[len(spec):]
        elif s_pos == 'X' and spec in full_src2:
            loc = full_src2.find(' '+spec+' ')+1
            full_src2 = full_src2[:loc]+full_src2[loc+len(spec):]

    full_tar1 = _synoname_strip_punct(full_tar1)
    tar1_words = full_tar1.split()
    tar1_num_words = len(tar1_words)

    full_src1 = _synoname_strip_punct(full_src1)
    src1_words = full_src1.split()
    src1_num_words = len(src1_words)

    full_tar2 = _synoname_strip_punct(full_tar2)
    tar2_words = full_tar2.split()
    tar2_num_words = len(tar2_words)

    full_src2 = _synoname_strip_punct(full_src2)
    src2_words = full_src2.split()
    src2_num_words = len(src2_words)

    # 2
    if (src1_num_words < 2 and src_len_specials == 0 and
            src2_num_words < 2 and tar_len_specials == 0):
        return 0

    # 4
    if (tar1_num_words == 1 and src1_num_words == 1 and
            tar1_words[0] == src1_words[0]):
        return 1
    if tar1_num_words < 2 and tar_len_specials == 0:
        return 0

    # 5
    last_found = False
    for word in tar1_words:
        if src_ln.endswith(word) or word+' ' in src_ln:
            last_found = True

    if not last_found:
        for word in src1_words:
            if tar_ln.endswith(word) or word+' ' in tar_ln:
                last_found = True

    # 6
    matches = 0
    if last_found:
        for i, s_word in enumerate(src1_words):
            for j, t_word in enumerate(tar1_words):
                if s_word == t_word:
                    src1_words[i] = '@'
                    tar1_words[j] = '@'
                    matches += 1
    w_ratio = matches/max(tar1_num_words, src1_num_words)
    if matches > 1 or (matches == 1 and
                       src1_num_words == 1 and tar1_num_words == 1 and
                       (tar_len_specials > 0 or src_len_specials > 0)):
        return w_ratio

    # 8
    if (tar2_num_words == 1 and src2_num_words == 1 and
            tar2_words[0] == src2_words[0]):
        return 1
    if tar2_num_words < 2 and tar_len_specials == 0:
        return 0

    # 9
    last_found = False
    for word in tar2_words:
        if src_ln.endswith(word) or word+' ' in src_ln:
            last_found = True

    if not last_found:
        for word in src2_words:
            if tar_ln.endswith(word) or word+' ' in tar_ln:
                last_found = True

    if not last_found:
        return 0

    # 10
    matches = 0
    if last_found:
        for i, s_word in enumerate(src2_words):
            for j, t_word in enumerate(tar2_words):
                if s_word == t_word:
                    src2_words[i] = '@'
                    tar2_words[j] = '@'
                    matches += 1
    w_ratio = matches/max(tar2_num_words, src2_num_words)
    if matches > 1 or (matches == 1 and
                       src2_num_words == 1 and tar2_num_words == 1 and
                       (tar_len_specials > 0 or src_len_specials > 0)):
        return w_ratio

    return 0


def synoname(src, tar, word_approx_min=0.3, char_approx_min=0.73,
             tests=2**11-1):
    """Return the Synoname similarity type of two words.

    Cf. :cite:`Getty:1991,Gross:1991`

    :param str src, tar: two names to be compared, as '#'-delimited strings
        or as (last name, first name, qualifier) tuples
    :param float word_approx_min: the minimum word approximation value to
        signal a 'word_approx' match
    :param float char_approx_min: the minimum character approximation value
        to signal a 'char_approx' match
    :param int or iterable tests: the set of tests to perform; as an integer,
        a bitfield of tests; as an iterable, the names of the tests to
        perform (all tests, by default)
    :return: Synoname value
    :rtype: int
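
    A rough sketch of usage (the result is a match type code, from 1 for
    'exact' through 13 for 'no_match')::

        synoname(('Brueghel', 'Pieter', ''), ('Brueghel', 'Pieter', ''))
        synoname(('Brueghel', 'Pieter', ''), ('Brueghel', 'Piter', ''),
                 tests=['exact', 'omission', 'substitution'])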
    """
    test_dict = {val: 2**n for n, val in enumerate([
        'exact', 'omission', 'substitution', 'transposition', 'punctuation',
        'initials', 'extended', 'inclusion', 'no_first', 'word_approx',
        'confusions', 'char_approx'])}
    match_type_dict = {val: n for n, val in enumerate([
        'exact', 'omission', 'substitution', 'transposition', 'punctuation',
        'initials', 'extended', 'inclusion', 'no_first', 'word_approx',
        'confusions', 'char_approx', 'no_match'], 1)}

    if isinstance(tests, Iterable):
        new_tests = 0
        for term in tests:
            if term in test_dict:
                new_tests += test_dict[term]
        tests = new_tests

    if isinstance(src, tuple):
        src_ln, src_fn, src_qual = src
    else:
        src_ln, src_fn, src_qual = src.split('#')[1:4]
    if isinstance(tar, tuple):
        tar_ln, tar_fn, tar_qual = tar
    else:
        tar_ln, tar_fn, tar_qual = tar.split('#')[1:4]

    def split_special(spec):
        """Split a toolcode specials string into (code, position) tuples."""
        spec_list = []
        while spec:
            spec_list.append((int(spec[:3]), spec[3:4]))
            spec = spec[4:]
        return spec_list

    # 1. Preprocessing

    # Lowercasing
    src_fn = src_fn.strip().lower()
    src_ln = src_ln.strip().lower()
    src_qual = src_qual.strip().lower()

    tar_fn = tar_fn.strip().lower()
    tar_ln = tar_ln.strip().lower()
    tar_qual = tar_qual.strip().lower()

    # Create toolcodes
    src_fn, src_ln, src_tc = synoname_toolcode(src_fn, src_ln, src_qual)
    tar_fn, tar_ln, tar_tc = synoname_toolcode(tar_fn, tar_ln, tar_qual)

    src_generation = int(src_tc[2])
    src_romancode = int(src_tc[3:6])
    src_len_fn = int(src_tc[6:8])
    src_tc = src_tc.split('#')
    src_specials = split_special(src_tc[1])

    tar_generation = int(tar_tc[2])
    tar_romancode = int(tar_tc[3:6])
    tar_len_fn = int(tar_tc[6:8])
    tar_tc = tar_tc.split('#')
    tar_specials = split_special(tar_tc[1])

    gen_conflict = (src_generation != tar_generation and
                    (src_generation or tar_generation))
    roman_conflict = (src_romancode != tar_romancode and
                      (src_romancode or tar_romancode))

    ln_equal = src_ln == tar_ln
    fn_equal = src_fn == tar_fn

    # approx_c
    def approx_c():
        if gen_conflict or roman_conflict:
            return False, 0

        full_src = ' '.join((src_ln, src_fn))
        if full_src.startswith('master '):
            full_src = full_src[len('master '):]
            for intro in ['of the ', 'of ', 'known as the ', 'with the ',
                          'with ']:
                if full_src.startswith(intro):
                    full_src = full_src[len(intro):]

        full_tar = ' '.join((tar_ln, tar_fn))
        if full_tar.startswith('master '):
            full_tar = full_tar[len('master '):]
            for intro in ['of the ', 'of ', 'known as the ', 'with the ',
                          'with ']:
                if full_tar.startswith(intro):
                    full_tar = full_tar[len(intro):]

        ca_ratio = sim_ratcliff_obershelp(full_src, full_tar)
        return ca_ratio >= char_approx_min, ca_ratio

    approx_c_result, ca_ratio = approx_c()

    if tests & test_dict['exact'] and fn_equal and ln_equal:
        return match_type_dict['exact']
    if tests & test_dict['omission']:
        if (fn_equal and
                levenshtein(src_ln, tar_ln, cost=(1, 1, 99, 99)) == 1):
            if not roman_conflict:
                return match_type_dict['omission']
        elif (ln_equal and
              levenshtein(src_fn, tar_fn, cost=(1, 1, 99, 99)) == 1):
            return match_type_dict['omission']
    if tests & test_dict['substitution']:
        if (fn_equal and
                levenshtein(src_ln, tar_ln, cost=(99, 99, 1, 99)) == 1):
            return match_type_dict['substitution']
        elif (ln_equal and
              levenshtein(src_fn, tar_fn, cost=(99, 99, 1, 99)) == 1):
            return match_type_dict['substitution']
    if tests & test_dict['transposition']:
        if (fn_equal and
                levenshtein(src_ln, tar_ln, cost=(99, 99, 99, 1)) == 1):
            return match_type_dict['transposition']
        elif (ln_equal and
              levenshtein(src_fn, tar_fn, cost=(99, 99, 99, 1)) == 1):
            return match_type_dict['transposition']
    if tests & test_dict['punctuation']:
        np_src_fn = _synoname_strip_punct(src_fn)
        np_tar_fn = _synoname_strip_punct(tar_fn)
        np_src_ln = _synoname_strip_punct(src_ln)
        np_tar_ln = _synoname_strip_punct(tar_ln)

        if np_src_fn == np_tar_fn and np_src_ln == np_tar_ln:
            return match_type_dict['punctuation']
    if tests & test_dict['initials'] and ln_equal:
        if src_fn or tar_fn:
            src_initials = ''.join(_[0] for _ in src_fn.split())
            tar_initials = ''.join(_[0] for _ in tar_fn.split())
            if src_initials == tar_initials:
                return match_type_dict['initials']
            initial_diff = abs(len(src_initials)-len(tar_initials))
            if (initial_diff and
                    ((initial_diff == levenshtein(src_initials, tar_initials,
                                                  cost=(1, 99, 99, 99))) or
                     (initial_diff == levenshtein(tar_initials, src_initials,
                                                  cost=(1, 99, 99, 99))))):
                return match_type_dict['initials']
    if tests & test_dict['extended']:
        if src_ln[0] == tar_ln[0] and (src_ln.startswith(tar_ln) or
                                       tar_ln.startswith(src_ln)):
            if ((not src_len_fn and not tar_len_fn) or
                    src_ln.startswith(tar_ln) or
                    tar_ln.startswith(src_ln)) and not roman_conflict:
                return match_type_dict['extended']
    if tests & test_dict['inclusion'] and ln_equal:
        if src_fn in tar_fn or tar_fn in src_fn:
            return match_type_dict['inclusion']
    if tests & test_dict['no_first'] and ln_equal:
        if src_fn == '' or tar_fn == '':
            return match_type_dict['no_first']
    if tests & test_dict['word_approx']:
        ratio = synoname_word_approximation(src_ln, tar_ln, src_fn, tar_fn,
                                            {'gen_conflict': gen_conflict,
                                             'roman_conflict': roman_conflict,
                                             'src_specials': src_specials,
                                             'tar_specials': tar_specials})
        if ratio == 1 and tests & test_dict['confusions']:
            if ' '.join((src_fn, src_ln)) == ' '.join((tar_fn, tar_ln)):
                return match_type_dict['confusions']
        if ratio >= word_approx_min:
            return match_type_dict['word_approx']
    if tests & test_dict['char_approx']:
        if ca_ratio >= char_approx_min:
            return match_type_dict['char_approx']
    return match_type_dict['no_match']


def sim_tfidf(src, tar, qval=2, docs_src=None, docs_tar=None):
    """Return the TF-IDF similarity of two strings.

    This is chiefly based on the "Formal Definition of TF/IDF Distance" at:
    http://alias-i.com/lingpipe/docs/api/com/aliasi/spell/TfIdfDistance.html

    Note that this implementation is incomplete: for non-identical, non-empty
    inputs it presently returns a placeholder value (see the TODO below).

    :param str src, tar: two strings to be compared (or QGrams/Counter
        objects)
    :param int qval: the length of each q-gram; 0 or None for non-q-gram
        version
    :param Counter docs_src: a Counter object or string representing the
        document corpus for the src string
    :param Counter docs_tar: a Counter object or string representing the
        document corpus for the tar string (or set to None to use the docs_src
        for both)
    :returns: TF-IDF similarity
    :rtype: float
    """
    if src == tar:
        # TODO: confirm correctness of this when docs are different
        return 1.0
    elif not src or not tar:
        return 0.0

    q_src, q_tar = _get_qgrams(src, tar, qval)

    if isinstance(docs_src, Counter):
        q_docs = docs_src
    elif qval and qval > 0:
        q_docs = QGrams(docs_src, qval)
    else:
        q_docs = Counter(docs_src.strip().split())

    if not q_src or not q_tar:
        return 0.0

    # TODO: finish implementation; q_docs (and docs_tar) will be used to
    # compute IDF values once the full TF-IDF calculation is in place
    return 0.5  # hardcoded to half


###############################################################################


def sim(src, tar, method=sim_levenshtein):
    """Return a similarity of two strings.

    This is a generalized function for calling other similarity functions.

    :param str src, tar: two strings to be compared
    :param function method: specifies the similarity metric (Levenshtein by
        default)
    :returns: similarity according to the specified function
    :rtype: float

    >>> round(sim('cat', 'hat'), 12)
    0.666666666667
    >>> round(sim('Niall', 'Neil'), 12)
    0.4
    >>> sim('aluminum', 'Catalan')
    0.125
    >>> sim('ATCG', 'TAGC')
    0.25
    """
    if callable(method):
        return method(src, tar)
    else:
        raise AttributeError('Unknown similarity function: ' + str(method))


def dist(src, tar, method=sim_levenshtein):
    """Return a distance between two strings.

    This is a generalized function for calling other distance functions.

    :param str src, tar: two strings to be compared
    :param function method: specifies the similarity metric (Levenshtein by
        default) -- Note that this takes a similarity metric function, not
        a distance metric function.
    :returns: distance according to the specified function
    :rtype: float

    >>> round(dist('cat', 'hat'), 12)
    0.333333333333
    >>> round(dist('Niall', 'Neil'), 12)
    0.6
    >>> dist('aluminum', 'Catalan')
    0.875
    >>> dist('ATCG', 'TAGC')
    0.75
    """
    if callable(method):
        return 1 - method(src, tar)
    else:
        raise AttributeError('Unknown distance function: ' + str(method))


if __name__ == '__main__':
    import doctest
    doctest.testmod()