Completed: Push to master (cd9fca...2c922f) by Chris, created 13:16

abydos.distance.dist_chebyshev(): grade A

Complexity
    Conditions: 1

Size
    Total Lines: 14
    Code Lines: 2

Duplication
    Lines: 0
    Ratio: 0%

Importance
    Changes: 0

Metric  Value
cc      1
eloc    2
nop     4
dl      0
loc     14
rs      10
c       0
b       0
f       0
# -*- coding: utf-8 -*-

# Copyright 2014-2018 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.

"""abydos.distance.

The distance module implements string edit distance functions including:

    - Levenshtein distance
    - Optimal String Alignment distance
    - Damerau-Levenshtein distance
    - Hamming distance
    - Tversky index
    - Sørensen–Dice coefficient & distance
    - Jaccard similarity coefficient & distance
    - overlap similarity & distance
    - Tanimoto coefficient & distance
    - Minkowski distance & similarity
    - Manhattan distance & similarity
    - Euclidean distance & similarity
    - Chebyshev distance & similarity
    - cosine similarity & distance
    - Jaro distance
    - Jaro-Winkler distance (incl. the strcmp95 algorithm variant)
    - Longest common substring
    - Ratcliff-Obershelp similarity & distance
    - Match Rating Algorithm similarity
    - Normalized Compression Distance (NCD) & similarity
    - Monge-Elkan similarity & distance
    - Matrix similarity
    - Needleman-Wunsch score
    - Smith-Waterman score
    - Gotoh score
    - Length similarity
    - Prefix, Suffix, and Identity similarity & distance
    - Modified Language-Independent Product Name Search (MLIPNS) similarity &
      distance
    - Bag distance
    - Editex distance
    - Eudex distances
    - Sift4 distance
    - Baystat distance & similarity
    - Typo distance
    - Indel distance
    - Synoname

Functions beginning with the prefixes 'sim' and 'dist' are guaranteed to be
in the range [0, 1], and sim_X = 1 - dist_X since the two are complements.
If a sim_X function is supplied identical src & tar arguments, it is guaranteed
to return 1; the corresponding dist_X function is guaranteed to return 0.
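
For example (illustrating this guarantee with the Levenshtein pair defined
below):

>>> sim_levenshtein('cat', 'cat')
1
>>> dist_levenshtein('cat', 'cat')
0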
"""

from __future__ import division, unicode_literals

from codecs import encode
from collections import Counter, Iterable, defaultdict
from math import log, sqrt
from numbers import Number
from sys import maxsize, modules
from types import GeneratorType
from unicodedata import normalize as unicode_normalize

from numpy import float32 as np_float32
from numpy import int as np_int
from numpy import zeros as np_zeros

from six import text_type
from six.moves import range

from .compression import ac_encode, ac_train, rle_encode
# noinspection PyProtectedMember
from .fingerprint import _synoname_special_table, synoname_toolcode
from .phonetic import eudex, mra
from .qgram import QGrams

try:
    import lzma
except ImportError:  # pragma: no cover
    # If the system lacks the lzma library, that's fine, but lzma compression
    # similarity won't be supported.
    lzma = None

__all__ = ['bag', 'chebyshev', 'damerau_levenshtein', 'dist', 'dist_bag',
           'dist_baystat', 'dist_compression', 'dist_cosine', 'dist_damerau',
           'dist_dice', 'dist_editex', 'dist_euclidean', 'dist_eudex',
           'dist_hamming', 'dist_ident', 'dist_indel', 'dist_jaccard',
           'dist_jaro_winkler', 'dist_lcsseq', 'dist_lcsstr', 'dist_length',
           'dist_levenshtein', 'dist_manhattan', 'dist_minkowski',
           'dist_mlipns', 'dist_monge_elkan', 'dist_mra',
           'dist_overlap', 'dist_prefix', 'dist_ratcliff_obershelp',
           'dist_sift4', 'dist_strcmp95', 'dist_suffix', 'dist_tversky',
           'dist_typo', 'editex', 'euclidean', 'eudex', 'eudex_hamming',
           'gotoh', 'hamming', 'lcsseq', 'lcsstr', 'levenshtein', 'manhattan',
           'minkowski', 'mra_compare', 'needleman_wunsch', 'sift4_common',
           'sift4_simplest', 'sim', 'sim_bag', 'sim_baystat',
           'sim_compression', 'sim_cosine', 'sim_damerau', 'sim_dice',
           'sim_editex', 'sim_euclidean', 'sim_eudex', 'sim_hamming',
           'sim_ident', 'sim_indel', 'sim_jaccard', 'sim_jaro_winkler',
           'sim_lcsseq', 'sim_lcsstr', 'sim_length', 'sim_levenshtein',
           'sim_manhattan', 'sim_matrix', 'sim_minkowski', 'sim_mlipns',
           'sim_monge_elkan', 'sim_mra', 'sim_overlap', 'sim_prefix',
           'sim_ratcliff_obershelp', 'sim_sift4', 'sim_strcmp95', 'sim_suffix',
           'sim_tanimoto', 'sim_tversky', 'sim_typo', 'smith_waterman',
           'synoname', '_synoname_word_approximation', 'tanimoto', 'typo']


def levenshtein(src, tar, mode='lev', cost=(1, 1, 1, 1)):
    """Return the Levenshtein distance between two strings.

    This is the standard edit distance measure. Cf.
    :cite:`Levenshtein:1965,Levenshtein:1966`.

    Two additional variants: optimal string alignment (aka restricted
    Damerau-Levenshtein distance) :cite:`Boytsov:2011` and the
    Damerau-Levenshtein :cite:`Damerau:1964` distance are also supported.

    The ordinary Levenshtein & Optimal String Alignment distance both
    employ the Wagner-Fischer dynamic programming algorithm
    :cite:`Wagner:1974`.

    Levenshtein edit distance ordinarily has unit insertion, deletion, and
    substitution costs.

    :param str src: source string for comparison
    :param str tar: target string for comparison
    :param str mode: specifies a mode for computing the Levenshtein distance:

        - 'lev' (default) computes the ordinary Levenshtein distance,
          in which edits may include inserts, deletes, and substitutions
        - 'osa' computes the Optimal String Alignment distance, in which
          edits may include inserts, deletes, substitutions, and
          transpositions but substrings may only be edited once
        - 'dam' computes the Damerau-Levenshtein distance, in which
          edits may include inserts, deletes, substitutions, and
          transpositions and substrings may undergo repeated edits

    :param tuple cost: a 4-tuple representing the cost of the four possible
        edits: inserts, deletes, substitutions, and transpositions,
        respectively (by default: (1, 1, 1, 1))
    :returns: the Levenshtein distance between src & tar
    :rtype: int (may return a float if cost has float values)

    >>> levenshtein('cat', 'hat')
    1
    >>> levenshtein('Niall', 'Neil')
    3
    >>> levenshtein('aluminum', 'Catalan')
    7
    >>> levenshtein('ATCG', 'TAGC')
    3

    >>> levenshtein('ATCG', 'TAGC', mode='osa')
    2
    >>> levenshtein('ACTG', 'TAGC', mode='osa')
    4

    >>> levenshtein('ATCG', 'TAGC', mode='dam')
    2
    >>> levenshtein('ACTG', 'TAGC', mode='dam')
    3
    """
    ins_cost, del_cost, sub_cost, trans_cost = cost

    if src == tar:
        return 0
    if not src:
        return len(tar) * ins_cost
    if not tar:
        return len(src) * del_cost

    if 'dam' in mode:
        return damerau_levenshtein(src, tar, cost)
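
    # Wagner-Fischer dynamic programming: d_mat[i, j] holds the edit
    # distance between the first i characters of src and the first j
    # characters of tar.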
    d_mat = np_zeros((len(src)+1, len(tar)+1), dtype=np_int)
    for i in range(len(src)+1):
        d_mat[i, 0] = i * del_cost
    for j in range(len(tar)+1):
        d_mat[0, j] = j * ins_cost

    for i in range(len(src)):
        for j in range(len(tar)):
            d_mat[i+1, j+1] = min(
                d_mat[i+1, j] + ins_cost,  # ins
                d_mat[i, j+1] + del_cost,  # del
                d_mat[i, j] + (sub_cost if src[i] != tar[j] else 0)  # sub/==
            )

            if mode == 'osa':
                if ((i+1 > 1 and j+1 > 1 and src[i] == tar[j-1] and
                     src[i-1] == tar[j])):
                    # transposition
                    d_mat[i+1, j+1] = min(d_mat[i+1, j+1],
                                          d_mat[i-1, j-1] + trans_cost)

    return d_mat[len(src), len(tar)]


def dist_levenshtein(src, tar, mode='lev', cost=(1, 1, 1, 1)):
    """Return the normalized Levenshtein distance between two strings.

    The Levenshtein distance is normalized by dividing the distance
    (calculated by any of the three supported methods) by the greater of
    the number of characters in src times the cost of a delete and
    the number of characters in tar times the cost of an insert.
    For the case in which all operations have :math:`cost = 1`, this is
    equivalent to the greater of the length of the two strings src & tar.

    :param str src: source string for comparison
    :param str tar: target string for comparison
    :param str mode: specifies a mode for computing the Levenshtein distance:

        - 'lev' (default) computes the ordinary Levenshtein distance,
          in which edits may include inserts, deletes, and substitutions
        - 'osa' computes the Optimal String Alignment distance, in which
          edits may include inserts, deletes, substitutions, and
          transpositions but substrings may only be edited once
        - 'dam' computes the Damerau-Levenshtein distance, in which
          edits may include inserts, deletes, substitutions, and
          transpositions and substrings may undergo repeated edits

    :param tuple cost: a 4-tuple representing the cost of the four possible
        edits: inserts, deletes, substitutions, and transpositions,
        respectively (by default: (1, 1, 1, 1))
    :returns: normalized Levenshtein distance
    :rtype: float

    >>> round(dist_levenshtein('cat', 'hat'), 12)
    0.333333333333
    >>> round(dist_levenshtein('Niall', 'Neil'), 12)
    0.6
    >>> dist_levenshtein('aluminum', 'Catalan')
    0.875
    >>> dist_levenshtein('ATCG', 'TAGC')
    0.75
    """
    if src == tar:
        return 0
    ins_cost, del_cost = cost[:2]
    return (levenshtein(src, tar, mode, cost) /
            (max(len(src)*del_cost, len(tar)*ins_cost)))


def sim_levenshtein(src, tar, mode='lev', cost=(1, 1, 1, 1)):
    """Return the Levenshtein similarity of two strings.

    Normalized Levenshtein similarity is the complement of normalized
    Levenshtein distance:
    :math:`sim_{Levenshtein} = 1 - dist_{Levenshtein}`.

    :param str src: source string for comparison
    :param str tar: target string for comparison
    :param str mode: specifies a mode for computing the Levenshtein distance:

            - 'lev' (default) computes the ordinary Levenshtein distance,
              in which edits may include inserts, deletes, and substitutions
            - 'osa' computes the Optimal String Alignment distance, in which
              edits may include inserts, deletes, substitutions, and
              transpositions but substrings may only be edited once
            - 'dam' computes the Damerau-Levenshtein distance, in which
              edits may include inserts, deletes, substitutions, and
              transpositions and substrings may undergo repeated edits

    :param tuple cost: a 4-tuple representing the cost of the four possible
        edits:
        inserts, deletes, substitutions, and transpositions, respectively
        (by default: (1, 1, 1, 1))
    :returns: normalized Levenshtein similarity
    :rtype: float

    >>> round(sim_levenshtein('cat', 'hat'), 12)
    0.666666666667
    >>> round(sim_levenshtein('Niall', 'Neil'), 12)
    0.4
    >>> sim_levenshtein('aluminum', 'Catalan')
    0.125
    >>> sim_levenshtein('ATCG', 'TAGC')
    0.25
    """
    return 1 - dist_levenshtein(src, tar, mode, cost)


def damerau_levenshtein(src, tar, cost=(1, 1, 1, 1)):
    """Return the Damerau-Levenshtein distance between two strings.

    This computes the Damerau-Levenshtein distance :cite:`Damerau:1964`.
    Damerau-Levenshtein code is based on Java code by Kevin L. Stern
    :cite:`Stern:2014`, under the MIT license:
    https://github.com/KevinStern/software-and-algorithms/blob/master/src/main/java/blogspot/software_and_algorithms/stern_library/string/DamerauLevenshteinAlgorithm.java

    :param str src: source string for comparison
    :param str tar: target string for comparison
    :param tuple cost: a 4-tuple representing the cost of the four possible
        edits:
        inserts, deletes, substitutions, and transpositions, respectively
        (by default: (1, 1, 1, 1))
    :returns: the Damerau-Levenshtein distance between src & tar
    :rtype: int (may return a float if cost has float values)

    >>> damerau_levenshtein('cat', 'hat')
    1
    >>> damerau_levenshtein('Niall', 'Neil')
    3
    >>> damerau_levenshtein('aluminum', 'Catalan')
    7
    >>> damerau_levenshtein('ATCG', 'TAGC')
    2
    """
    ins_cost, del_cost, sub_cost, trans_cost = cost

    if src == tar:
        return 0
    if not src:
        return len(tar) * ins_cost
    if not tar:
        return len(src) * del_cost

    if 2*trans_cost < ins_cost + del_cost:
        raise ValueError('Unsupported cost assignment; the cost of two ' +
                         'transpositions must not be less than the cost of ' +
                         'an insert plus a delete.')

    d_mat = (np_zeros((len(src))*(len(tar)), dtype=np_int).
             reshape((len(src), len(tar))))

    if src[0] != tar[0]:
        d_mat[0, 0] = min(sub_cost, ins_cost + del_cost)

    src_index_by_character = {src[0]: 0}
    for i in range(1, len(src)):
        del_distance = d_mat[i-1, 0] + del_cost
        ins_distance = (i+1) * del_cost + ins_cost
        match_distance = (i * del_cost +
                          (0 if src[i] == tar[0] else sub_cost))
        d_mat[i, 0] = min(del_distance, ins_distance, match_distance)

    for j in range(1, len(tar)):
        del_distance = (j+1) * ins_cost + del_cost
        ins_distance = d_mat[0, j-1] + ins_cost
        match_distance = (j * ins_cost +
                          (0 if src[0] == tar[j] else sub_cost))
        d_mat[0, j] = min(del_distance, ins_distance, match_distance)
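
    # Fill in the rest of the matrix. src_index_by_character records, for
    # each character seen so far in src, the last row where it occurred;
    # max_src_letter_match_index records the last column where the current
    # src character matched tar. Together these locate the nearest candidate
    # transposition for each cell.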
    for i in range(1, len(src)):
        max_src_letter_match_index = (0 if src[i] == tar[0] else -1)
        for j in range(1, len(tar)):
            candidate_swap_index = (-1 if tar[j] not in
                                    src_index_by_character else
                                    src_index_by_character[tar[j]])
            j_swap = max_src_letter_match_index
            del_distance = d_mat[i-1, j] + del_cost
            ins_distance = d_mat[i, j-1] + ins_cost
            match_distance = d_mat[i-1, j-1]
            if src[i] != tar[j]:
                match_distance += sub_cost
            else:
                max_src_letter_match_index = j

            if candidate_swap_index != -1 and j_swap != -1:
                i_swap = candidate_swap_index

                if i_swap == 0 and j_swap == 0:
                    pre_swap_cost = 0
                else:
                    pre_swap_cost = d_mat[max(0, i_swap-1), max(0, j_swap-1)]
                swap_distance = (pre_swap_cost + (i - i_swap - 1) *
                                 del_cost + (j - j_swap - 1) * ins_cost +
                                 trans_cost)
            else:
                swap_distance = maxsize

            d_mat[i, j] = min(del_distance, ins_distance,
                              match_distance, swap_distance)
        src_index_by_character[src[i]] = i

    return d_mat[len(src)-1, len(tar)-1]


def dist_damerau(src, tar, cost=(1, 1, 1, 1)):
    """Return the normalized Damerau-Levenshtein distance of two strings.

    Damerau-Levenshtein distance normalized to the interval [0, 1].

    The Damerau-Levenshtein distance is normalized by dividing it by
    the greater of the number of characters in src times the cost of a
    delete and the number of characters in tar times the cost of an insert.
    For the case in which all operations have :math:`cost = 1`, this is
    equivalent to the greater of the length of the two strings src & tar.

    The arguments are identical to those of the levenshtein() function.

    :param str src: source string for comparison
    :param str tar: target string for comparison
    :param tuple cost: a 4-tuple representing the cost of the four possible
        edits:
        inserts, deletes, substitutions, and transpositions, respectively
        (by default: (1, 1, 1, 1))
    :returns: normalized Damerau-Levenshtein distance
    :rtype: float

    >>> round(dist_damerau('cat', 'hat'), 12)
    0.333333333333
    >>> round(dist_damerau('Niall', 'Neil'), 12)
    0.6
    >>> dist_damerau('aluminum', 'Catalan')
    0.875
    >>> dist_damerau('ATCG', 'TAGC')
    0.5
    """
    if src == tar:
        return 0
    ins_cost, del_cost = cost[:2]
    return (damerau_levenshtein(src, tar, cost) /
            (max(len(src)*del_cost, len(tar)*ins_cost)))


def sim_damerau(src, tar, cost=(1, 1, 1, 1)):
    """Return the Damerau-Levenshtein similarity of two strings.

    Normalized Damerau-Levenshtein similarity is the complement of normalized
    Damerau-Levenshtein distance:
    :math:`sim_{Damerau} = 1 - dist_{Damerau}`.

    The arguments are identical to those of the levenshtein() function.

    :param str src: source string for comparison
    :param str tar: target string for comparison
    :param tuple cost: a 4-tuple representing the cost of the four possible
        edits:
        inserts, deletes, substitutions, and transpositions, respectively
        (by default: (1, 1, 1, 1))
    :returns: normalized Damerau-Levenshtein similarity
    :rtype: float

    >>> round(sim_damerau('cat', 'hat'), 12)
    0.666666666667
    >>> round(sim_damerau('Niall', 'Neil'), 12)
    0.4
    >>> sim_damerau('aluminum', 'Catalan')
    0.125
    >>> sim_damerau('ATCG', 'TAGC')
    0.5
    """
    return 1 - dist_damerau(src, tar, cost)


def hamming(src, tar, diff_lens=True):
    """Return the Hamming distance between two strings.

    Hamming distance :cite:`Hamming:1950` equals the number of character
    positions at which two strings differ. For strings of unequal lengths,
    it is not normally defined. By default, this implementation calculates the
    Hamming distance of the first n characters where n is the lesser of the two
    strings' lengths and adds to this the difference in string lengths.

    :param str src: source string for comparison
    :param str tar: target string for comparison
    :param bool diff_lens:
        If True (default), this returns the Hamming distance for those
        characters that have a matching character in both strings plus the
        difference in the strings' lengths. This is equivalent to extending
        the shorter string with obligatorily non-matching characters.
        If False, an exception is raised in the case of strings of unequal
        lengths.
    :returns: the Hamming distance between src & tar
    :rtype: int

    >>> hamming('cat', 'hat')
    1
    >>> hamming('Niall', 'Neil')
    3
    >>> hamming('aluminum', 'Catalan')
    8
    >>> hamming('ATCG', 'TAGC')
    4
    """
    if not diff_lens and len(src) != len(tar):
        raise ValueError('Undefined for sequences of unequal length; set ' +
                         'diff_lens to True for Hamming distance between ' +
                         'strings of unequal lengths.')

    hdist = 0
    if diff_lens:
        hdist += abs(len(src)-len(tar))
    hdist += sum(c1 != c2 for c1, c2 in zip(src, tar))

    return hdist


def dist_hamming(src, tar, diff_lens=True):
    """Return the normalized Hamming distance between two strings.

    Hamming distance normalized to the interval [0, 1].

    The Hamming distance is normalized by dividing it
    by the greater of the number of characters in src & tar (unless diff_lens
    is set to False, in which case an exception is raised).

    The arguments are identical to those of the hamming() function.

    :param str src: source string for comparison
    :param str tar: target string for comparison
    :param bool diff_lens:
        If True (default), this returns the Hamming distance for those
        characters that have a matching character in both strings plus the
        difference in the strings' lengths. This is equivalent to extending
        the shorter string with obligatorily non-matching characters.
        If False, an exception is raised in the case of strings of unequal
        lengths.
    :returns: normalized Hamming distance
    :rtype: float

    >>> round(dist_hamming('cat', 'hat'), 12)
    0.333333333333
    >>> dist_hamming('Niall', 'Neil')
    0.6
    >>> dist_hamming('aluminum', 'Catalan')
    1.0
    >>> dist_hamming('ATCG', 'TAGC')
    1.0
    """
    if src == tar:
        return 0
    return hamming(src, tar, diff_lens) / max(len(src), len(tar))


def sim_hamming(src, tar, diff_lens=True):
    """Return the normalized Hamming similarity of two strings.

    Hamming similarity normalized to the interval [0, 1].

    Hamming similarity is the complement of normalized Hamming distance:
    :math:`sim_{Hamming} = 1 - dist_{Hamming}`.

    Provided that diff_lens==True, the Hamming similarity is identical to the
    Language-Independent Product Name Search (LIPNS) similarity score. For
    further information, see the sim_mlipns documentation.

    The arguments are identical to those of the hamming() function.

    :param str src: source string for comparison
    :param str tar: target string for comparison
    :param bool diff_lens:
        If True (default), this returns the Hamming distance for those
        characters that have a matching character in both strings plus the
        difference in the strings' lengths. This is equivalent to extending
        the shorter string with obligatorily non-matching characters.
        If False, an exception is raised in the case of strings of unequal
        lengths.
    :returns: normalized Hamming similarity
    :rtype: float

    >>> round(sim_hamming('cat', 'hat'), 12)
    0.666666666667
    >>> sim_hamming('Niall', 'Neil')
    0.4
    >>> sim_hamming('aluminum', 'Catalan')
    0.0
    >>> sim_hamming('ATCG', 'TAGC')
    0.0
    """
    return 1 - dist_hamming(src, tar, diff_lens)


def _get_qgrams(src, tar, qval=0, skip=0):
    """Return the Q-Grams in src & tar.

    :param str src: source string (or QGrams/Counter objects) for comparison
    :param str tar: target string (or QGrams/Counter objects) for comparison
    :param int qval: the length of each q-gram; 0 for non-q-gram version
    :param int skip: the number of characters to skip (only works when
        src and tar are strings)
    :returns: Q-Grams
    :rtype: tuple of Counters

    >>> _get_qgrams('AT', 'TT', qval=2)
    (QGrams({'$A': 1, 'AT': 1, 'T#': 1}), QGrams({'$T': 1, 'TT': 1, 'T#': 1}))
    """
    if isinstance(src, Counter) and isinstance(tar, Counter):
        return src, tar
    if qval > 0:
        return (QGrams(src, qval, '$#', skip),
                QGrams(tar, qval, '$#', skip))
    return Counter(src.strip().split()), Counter(tar.strip().split())


def sim_tversky(src, tar, qval=2, alpha=1, beta=1, bias=None):
    r"""Return the Tversky index of two strings.

    The Tversky index :cite:`Tversky:1977` is defined as:
    For two sets X and Y:
    :math:`sim_{Tversky}(X, Y) = \\frac{|X \\cap Y|}
    {|X \\cap Y| + \\alpha|X - Y| + \\beta|Y - X|}`.

    :math:`\\alpha = \\beta = 1` is equivalent to the Jaccard & Tanimoto
    similarity coefficients.

    :math:`\\alpha = \\beta = 0.5` is equivalent to the Sørensen-Dice
    similarity coefficient :cite:`Dice:1945,Sorensen:1948`.

    Unequal α and β will tend to emphasize one or the other set's
    contributions:

        - :math:`\\alpha > \\beta` emphasizes the contributions of X over Y
        - :math:`\\alpha < \\beta` emphasizes the contributions of Y over X

    Parameter values' relation to 1 emphasizes different types of
    contributions:

        - :math:`\\alpha, \\beta > 1` emphasize unique contributions over the
          intersection
        - :math:`\\alpha, \\beta < 1` emphasize the intersection over unique
          contributions

    The symmetric variant is defined in :cite:`Jiminez:2013`. This is activated
    by specifying a bias parameter.

    :param str src: source string (or QGrams/Counter objects) for comparison
    :param str tar: target string (or QGrams/Counter objects) for comparison
    :param int qval: the length of each q-gram; 0 for non-q-gram version
    :param float alpha: Tversky index parameter as described above
    :param float beta: Tversky index parameter as described above
    :param float bias: The symmetric Tversky index bias parameter
    :returns: Tversky similarity
    :rtype: float

    >>> sim_tversky('cat', 'hat')
    0.3333333333333333
    >>> sim_tversky('Niall', 'Neil')
    0.2222222222222222
    >>> sim_tversky('aluminum', 'Catalan')
    0.0625
    >>> sim_tversky('ATCG', 'TAGC')
    0.0
    """
    if alpha < 0 or beta < 0:
        raise ValueError('Unsupported weight assignment; alpha and beta ' +
                         'must be greater than or equal to 0.')

    if src == tar:
        return 1.0
    elif not src or not tar:
        return 0.0

    q_src, q_tar = _get_qgrams(src, tar, qval)
    q_src_mag = sum(q_src.values())
    q_tar_mag = sum(q_tar.values())
    q_intersection_mag = sum((q_src & q_tar).values())

    if not q_src or not q_tar:
        return 0.0

    if bias is None:
        return q_intersection_mag / (q_intersection_mag + alpha *
                                     (q_src_mag - q_intersection_mag) +
                                     beta * (q_tar_mag - q_intersection_mag))
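
    # Symmetric variant (bias is not None): weight the smaller (a_val) and
    # larger (b_val) of the two unique contributions, per :cite:`Jiminez:2013`.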
    a_val = min(q_src_mag - q_intersection_mag,
                q_tar_mag - q_intersection_mag)
    b_val = max(q_src_mag - q_intersection_mag,
                q_tar_mag - q_intersection_mag)
    c_val = q_intersection_mag + bias
    return c_val / (beta * (alpha * a_val + (1 - alpha) * b_val) + c_val)


def dist_tversky(src, tar, qval=2, alpha=1, beta=1, bias=None):
    """Return the Tversky distance between two strings.

    Tversky distance is the complement of the Tversky index (similarity):
    :math:`dist_{Tversky} = 1-sim_{Tversky}`.

    :param str src: source string (or QGrams/Counter objects) for comparison
    :param str tar: target string (or QGrams/Counter objects) for comparison
    :param int qval: the length of each q-gram; 0 for non-q-gram
        version
    :param float alpha: the Tversky index's alpha parameter
    :param float beta: the Tversky index's beta parameter
    :param float bias: The symmetric Tversky index bias parameter
    :returns: Tversky distance
    :rtype: float

    >>> dist_tversky('cat', 'hat')
    0.6666666666666667
    >>> dist_tversky('Niall', 'Neil')
    0.7777777777777778
    >>> dist_tversky('aluminum', 'Catalan')
    0.9375
    >>> dist_tversky('ATCG', 'TAGC')
    1.0
    """
    return 1 - sim_tversky(src, tar, qval, alpha, beta, bias)


def sim_dice(src, tar, qval=2):
    r"""Return the Sørensen–Dice coefficient of two strings.

    For two sets X and Y, the Sørensen–Dice coefficient
    :cite:`Dice:1945,Sorensen:1948` is
    :math:`sim_{dice}(X, Y) = \\frac{2 \\cdot |X \\cap Y|}{|X| + |Y|}`.
    This is identical to the Tversky index :cite:`Tversky:1977` for
    :math:`\\alpha = \\beta = 0.5` and a monotone transform of the Jaccard &
    Tanimoto :cite:`Tanimoto:1958` similarity coefficients.

    :param str src: source string (or QGrams/Counter objects) for comparison
    :param str tar: target string (or QGrams/Counter objects) for comparison
    :param int qval: the length of each q-gram; 0 for non-q-gram
        version
    :returns: Sørensen–Dice similarity
    :rtype: float

    >>> sim_dice('cat', 'hat')
    0.5
    >>> sim_dice('Niall', 'Neil')
    0.36363636363636365
    >>> sim_dice('aluminum', 'Catalan')
    0.11764705882352941
    >>> sim_dice('ATCG', 'TAGC')
    0.0
    """
    return sim_tversky(src, tar, qval, 0.5, 0.5)


def dist_dice(src, tar, qval=2):
    """Return the Sørensen–Dice distance between two strings.

    Sørensen–Dice distance is the complement of the Sørensen–Dice coefficient:
    :math:`dist_{dice} = 1 - sim_{dice}`.

    :param str src: source string (or QGrams/Counter objects) for comparison
    :param str tar: target string (or QGrams/Counter objects) for comparison
    :param int qval: the length of each q-gram; 0 for non-q-gram
        version
    :returns: Sørensen–Dice distance
    :rtype: float

    >>> dist_dice('cat', 'hat')
    0.5
    >>> dist_dice('Niall', 'Neil')
    0.6363636363636364
    >>> dist_dice('aluminum', 'Catalan')
    0.8823529411764706
    >>> dist_dice('ATCG', 'TAGC')
    1.0
    """
    return 1 - sim_dice(src, tar, qval)


def sim_jaccard(src, tar, qval=2):
    r"""Return the Jaccard similarity of two strings.

    For two sets X and Y, the Jaccard similarity coefficient
    :cite:`Jaccard:1901` is :math:`sim_{jaccard}(X, Y) =
    \\frac{|X \\cap Y|}{|X \\cup Y|}`.

    This is identical to the Tanimoto similarity coefficient
    :cite:`Tanimoto:1958`
    and the Tversky index :cite:`Tversky:1977` for
    :math:`\\alpha = \\beta = 1`.

    :param str src: source string (or QGrams/Counter objects) for comparison
    :param str tar: target string (or QGrams/Counter objects) for comparison
    :param int qval: the length of each q-gram; 0 for non-q-gram
        version
    :returns: Jaccard similarity
    :rtype: float

    >>> sim_jaccard('cat', 'hat')
    0.3333333333333333
    >>> sim_jaccard('Niall', 'Neil')
    0.2222222222222222
    >>> sim_jaccard('aluminum', 'Catalan')
    0.0625
    >>> sim_jaccard('ATCG', 'TAGC')
    0.0
    """
    return sim_tversky(src, tar, qval, 1, 1)


def dist_jaccard(src, tar, qval=2):
    """Return the Jaccard distance between two strings.

    Jaccard distance is the complement of the Jaccard similarity coefficient:
    :math:`dist_{Jaccard} = 1 - sim_{Jaccard}`.

    :param str src: source string (or QGrams/Counter objects) for comparison
    :param str tar: target string (or QGrams/Counter objects) for comparison
    :param int qval: the length of each q-gram; 0 for non-q-gram version
    :returns: Jaccard distance
    :rtype: float

    >>> dist_jaccard('cat', 'hat')
    0.6666666666666667
    >>> dist_jaccard('Niall', 'Neil')
    0.7777777777777778
    >>> dist_jaccard('aluminum', 'Catalan')
    0.9375
    >>> dist_jaccard('ATCG', 'TAGC')
    1.0
    """
    return 1 - sim_jaccard(src, tar, qval)


def sim_overlap(src, tar, qval=2):
    r"""Return the overlap coefficient of two strings.

    For two sets X and Y, the overlap coefficient
    :cite:`Szymkiewicz:1934,Simpson:1949`, also called the
    Szymkiewicz-Simpson coefficient, is
    :math:`sim_{overlap}(X, Y) = \\frac{|X \\cap Y|}{min(|X|, |Y|)}`.

    :param str src: source string (or QGrams/Counter objects) for comparison
    :param str tar: target string (or QGrams/Counter objects) for comparison
    :param int qval: the length of each q-gram; 0 for non-q-gram version
    :returns: overlap similarity
    :rtype: float

    >>> sim_overlap('cat', 'hat')
    0.5
    >>> sim_overlap('Niall', 'Neil')
    0.4
    >>> sim_overlap('aluminum', 'Catalan')
    0.125
    >>> sim_overlap('ATCG', 'TAGC')
    0.0
    """
    if src == tar:
        return 1.0
    elif not src or not tar:
        return 0.0

    q_src, q_tar = _get_qgrams(src, tar, qval)
    q_src_mag = sum(q_src.values())
    q_tar_mag = sum(q_tar.values())
    q_intersection_mag = sum((q_src & q_tar).values())

    return q_intersection_mag / min(q_src_mag, q_tar_mag)


def dist_overlap(src, tar, qval=2):
    """Return the overlap distance between two strings.

    Overlap distance is the complement of the overlap coefficient:
    :math:`dist_{overlap} = 1 - sim_{overlap}`.

    :param str src: source string (or QGrams/Counter objects) for comparison
    :param str tar: target string (or QGrams/Counter objects) for comparison
    :param int qval: the length of each q-gram; 0 for non-q-gram version
    :returns: overlap distance
    :rtype: float

    >>> dist_overlap('cat', 'hat')
    0.5
    >>> dist_overlap('Niall', 'Neil')
    0.6
    >>> dist_overlap('aluminum', 'Catalan')
    0.875
    >>> dist_overlap('ATCG', 'TAGC')
    1.0
    """
    return 1 - sim_overlap(src, tar, qval)


def sim_tanimoto(src, tar, qval=2):
    r"""Return the Tanimoto similarity of two strings.

    For two sets X and Y, the Tanimoto similarity coefficient
    :cite:`Tanimoto:1958` is
    :math:`sim_{Tanimoto}(X, Y) = \\frac{|X \\cap Y|}{|X \\cup Y|}`.

    This is identical to the Jaccard similarity coefficient
    :cite:`Jaccard:1901` and the Tversky index :cite:`Tversky:1977` for
    :math:`\\alpha = \\beta = 1`.

    :param str src: source string (or QGrams/Counter objects) for comparison
    :param str tar: target string (or QGrams/Counter objects) for comparison
    :param int qval: the length of each q-gram; 0 for non-q-gram version
    :returns: Tanimoto similarity
    :rtype: float

    >>> sim_tanimoto('cat', 'hat')
    0.3333333333333333
    >>> sim_tanimoto('Niall', 'Neil')
    0.2222222222222222
    >>> sim_tanimoto('aluminum', 'Catalan')
    0.0625
    >>> sim_tanimoto('ATCG', 'TAGC')
    0.0
    """
    return sim_jaccard(src, tar, qval)


def tanimoto(src, tar, qval=2):
    """Return the Tanimoto distance between two strings.

    Tanimoto distance is :math:`log_{2}sim_{Tanimoto}`, the base-2 logarithm
    of the Tanimoto coefficient (a non-positive value).

    :param str src: source string (or QGrams/Counter objects) for comparison
    :param str tar: target string (or QGrams/Counter objects) for comparison
    :param int qval: the length of each q-gram; 0 for non-q-gram version
    :returns: Tanimoto distance
    :rtype: float

    >>> tanimoto('cat', 'hat')
    -1.5849625007211563
    >>> tanimoto('Niall', 'Neil')
    -2.1699250014423126
    >>> tanimoto('aluminum', 'Catalan')
    -4.0
    >>> tanimoto('ATCG', 'TAGC')
    -inf
    """
    coeff = sim_jaccard(src, tar, qval)
    if coeff != 0:
        return log(coeff, 2)

    return float('-inf')


def minkowski(src, tar, qval=2, pval=1, normalized=False, alphabet=None):
    """Return the Minkowski distance (:math:`L^p-norm`) of two strings.

    The Minkowski distance :cite:`Minkowski:1910` is a distance metric in
    :math:`L^p-space`.

    :param str src: source string (or QGrams/Counter objects) for comparison
    :param str tar: target string (or QGrams/Counter objects) for comparison
    :param int qval: the length of each q-gram; 0 for non-q-gram version
    :param int or float pval: the :math:`p`-value of the :math:`L^p`-space.
    :param bool normalized: normalizes to [0, 1] if True
    :param collection or int alphabet: the values or size of the alphabet
    :returns: the Minkowski distance
    :rtype: float

    >>> minkowski('cat', 'hat')
    4.0
    >>> minkowski('Niall', 'Neil')
    7.0
    >>> minkowski('Colin', 'Cuilen')
    9.0
    >>> minkowski('ATCG', 'TAGC')
    10.0
    """
    q_src, q_tar = _get_qgrams(src, tar, qval)
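
    # diffs holds the per-q-gram absolute count differences |x_i - y_i|,
    # i.e. the multiset symmetric difference of the two q-gram profiles.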
    diffs = ((q_src - q_tar) + (q_tar - q_src)).values()
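
    # normalizer stays 1 for the raw distance. For the normalized variant it
    # is the alphabet size (if an alphabet is given), the count of distinct
    # q-grams (when pval == 0), or the strings' total q-gram counts under the
    # same L^p norm.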
    normalizer = 1
    if normalized:
        totals = (q_src + q_tar).values()
        if alphabet is not None:
            # noinspection PyTypeChecker
            normalizer = (alphabet if isinstance(alphabet, Number) else
                          len(alphabet))
        elif pval == 0:
            normalizer = len(totals)
        else:
            normalizer = sum(_**pval for _ in totals)**(1 / pval)

    if len(diffs) == 0:
        return 0.0
    if pval == float('inf'):
        # Chebyshev distance
        return max(diffs)/normalizer
    if pval == 0:
        # This is the l_0 "norm" as developed by David Donoho
        return len(diffs)/normalizer
    return sum(_**pval for _ in diffs)**(1 / pval)/normalizer


def dist_minkowski(src, tar, qval=2, pval=1, alphabet=None):
    """Return normalized Minkowski distance of two strings.

    The normalized Minkowski distance :cite:`Minkowski:1910` is a distance
    metric in :math:`L^p-space`, normalized to [0, 1].

    :param str src: source string (or QGrams/Counter objects) for comparison
    :param str tar: target string (or QGrams/Counter objects) for comparison
    :param int qval: the length of each q-gram; 0 for non-q-gram version
    :param int or float pval: the :math:`p`-value of the :math:`L^p`-space.
    :param collection or int alphabet: the values or size of the alphabet
    :returns: the normalized Minkowski distance
    :rtype: float

    >>> dist_minkowski('cat', 'hat')
    0.5
    >>> round(dist_minkowski('Niall', 'Neil'), 12)
    0.636363636364
    >>> round(dist_minkowski('Colin', 'Cuilen'), 12)
    0.692307692308
    >>> dist_minkowski('ATCG', 'TAGC')
    1.0
    """
    return minkowski(src, tar, qval, pval, True, alphabet)


def sim_minkowski(src, tar, qval=2, pval=1, alphabet=None):
    """Return normalized Minkowski similarity of two strings.

    Minkowski similarity is the complement of Minkowski distance:
    :math:`sim_{Minkowski} = 1 - dist_{Minkowski}`.

    :param str src: source string (or QGrams/Counter objects) for comparison
    :param str tar: target string (or QGrams/Counter objects) for comparison
    :param int qval: the length of each q-gram; 0 for non-q-gram version
    :param int or float pval: the :math:`p`-value of the :math:`L^p`-space.
    :param collection or int alphabet: the values or size of the alphabet
    :returns: the normalized Minkowski similarity
    :rtype: float

    >>> sim_minkowski('cat', 'hat')
    0.5
    >>> round(sim_minkowski('Niall', 'Neil'), 12)
    0.363636363636
    >>> round(sim_minkowski('Colin', 'Cuilen'), 12)
    0.307692307692
    >>> sim_minkowski('ATCG', 'TAGC')
    0.0
    """
    return 1-minkowski(src, tar, qval, pval, True, alphabet)


def manhattan(src, tar, qval=2, normalized=False, alphabet=None):
    """Return the Manhattan distance between two strings.

    Manhattan distance is the city-block or taxi-cab distance, equivalent
    to Minkowski distance in :math:`L^1`-space.

    :param str src: source string (or QGrams/Counter objects) for comparison
    :param str tar: target string (or QGrams/Counter objects) for comparison
    :param int qval: the length of each q-gram; 0 for non-q-gram version
    :param normalized: normalizes to [0, 1] if True
    :param collection or int alphabet: the values or size of the alphabet
    :returns: the Manhattan distance
    :rtype: float

    >>> manhattan('cat', 'hat')
    4.0
    >>> manhattan('Niall', 'Neil')
    7.0
    >>> manhattan('Colin', 'Cuilen')
    9.0
    >>> manhattan('ATCG', 'TAGC')
    10.0
    """
    return minkowski(src, tar, qval, 1, normalized, alphabet)


def dist_manhattan(src, tar, qval=2, alphabet=None):
    """Return the normalized Manhattan distance between two strings.

    The normalized Manhattan distance is a distance
    metric in :math:`L^1-space`, normalized to [0, 1].

    This is identical to the Bray-Curtis dissimilarity of the strings'
    q-gram profiles.

    :param str src: source string (or QGrams/Counter objects) for comparison
    :param str tar: target string (or QGrams/Counter objects) for comparison
    :param int qval: the length of each q-gram; 0 for non-q-gram version
    :param collection or int alphabet: the values or size of the alphabet
    :returns: the normalized Manhattan distance
    :rtype: float

    >>> dist_manhattan('cat', 'hat')
    0.5
    >>> round(dist_manhattan('Niall', 'Neil'), 12)
    0.636363636364
    >>> round(dist_manhattan('Colin', 'Cuilen'), 12)
    0.692307692308
    >>> dist_manhattan('ATCG', 'TAGC')
    1.0
    """
    return manhattan(src, tar, qval, True, alphabet)


def sim_manhattan(src, tar, qval=2, alphabet=None):
    """Return the normalized Manhattan similarity of two strings.

    Manhattan similarity is the complement of Manhattan distance:
    :math:`sim_{Manhattan} = 1 - dist_{Manhattan}`.

    :param str src: source string (or QGrams/Counter objects) for comparison
    :param str tar: target string (or QGrams/Counter objects) for comparison
    :param int qval: the length of each q-gram; 0 for non-q-gram version
    :param collection or int alphabet: the values or size of the alphabet
    :returns: the normalized Manhattan similarity
    :rtype: float

    >>> sim_manhattan('cat', 'hat')
    0.5
    >>> round(sim_manhattan('Niall', 'Neil'), 12)
    0.363636363636
    >>> round(sim_manhattan('Colin', 'Cuilen'), 12)
    0.307692307692
    >>> sim_manhattan('ATCG', 'TAGC')
    0.0
    """
    return 1-manhattan(src, tar, qval, True, alphabet)


def euclidean(src, tar, qval=2, normalized=False, alphabet=None):
    """Return the Euclidean distance between two strings.

    Euclidean distance is the straight-line or as-the-crow-flies distance,
    equivalent to Minkowski distance in :math:`L^2`-space.

    :param str src: source string (or QGrams/Counter objects) for comparison
    :param str tar: target string (or QGrams/Counter objects) for comparison
    :param int qval: the length of each q-gram; 0 for non-q-gram version
    :param normalized: normalizes to [0, 1] if True
    :param collection or int alphabet: the values or size of the alphabet
    :returns: the Euclidean distance
    :rtype: float

    >>> euclidean('cat', 'hat')
    2.0
    >>> round(euclidean('Niall', 'Neil'), 12)
    2.645751311065
    >>> euclidean('Colin', 'Cuilen')
    3.0
    >>> round(euclidean('ATCG', 'TAGC'), 12)
    3.162277660168
    """
    return minkowski(src, tar, qval, 2, normalized, alphabet)


def dist_euclidean(src, tar, qval=2, alphabet=None):
    """Return the normalized Euclidean distance between two strings.

    The normalized Euclidean distance is a distance
    metric in :math:`L^2-space`, normalized to [0, 1].

    :param str src: source string (or QGrams/Counter objects) for comparison
    :param str tar: target string (or QGrams/Counter objects) for comparison
    :param int qval: the length of each q-gram; 0 for non-q-gram version
    :param collection or int alphabet: the values or size of the alphabet
    :returns: the normalized Euclidean distance
    :rtype: float

    >>> round(dist_euclidean('cat', 'hat'), 12)
    0.57735026919
    >>> round(dist_euclidean('Niall', 'Neil'), 12)
    0.683130051064
    >>> round(dist_euclidean('Colin', 'Cuilen'), 12)
    0.727606875109
    >>> dist_euclidean('ATCG', 'TAGC')
    1.0
    """
    return euclidean(src, tar, qval, True, alphabet)


def sim_euclidean(src, tar, qval=2, alphabet=None):
    """Return the normalized Euclidean similarity of two strings.

    Euclidean similarity is the complement of Euclidean distance:
    :math:`sim_{Euclidean} = 1 - dist_{Euclidean}`.

    :param str src: source string (or QGrams/Counter objects) for comparison
    :param str tar: target string (or QGrams/Counter objects) for comparison
    :param int qval: the length of each q-gram; 0 for non-q-gram version
    :param collection or int alphabet: the values or size of the alphabet
    :returns: the normalized Euclidean similarity
    :rtype: float

    >>> round(sim_euclidean('cat', 'hat'), 12)
    0.42264973081
    >>> round(sim_euclidean('Niall', 'Neil'), 12)
    0.316869948936
    >>> round(sim_euclidean('Colin', 'Cuilen'), 12)
    0.272393124891
    >>> sim_euclidean('ATCG', 'TAGC')
    0.0
    """
    return 1-euclidean(src, tar, qval, True, alphabet)


def chebyshev(src, tar, qval=2, normalized=False, alphabet=None):
    r"""Return the Chebyshev distance between two strings.

    Chebyshev distance is the chessboard distance,
    equivalent to Minkowski distance in :math:`L^\infty-space`.

    :param str src: source string (or QGrams/Counter objects) for comparison
    :param str tar: target string (or QGrams/Counter objects) for comparison
    :param int qval: the length of each q-gram; 0 for non-q-gram version
    :param normalized: normalizes to [0, 1] if True
    :param collection or int alphabet: the values or size of the alphabet
    :returns: the Chebyshev distance
    :rtype: float

    >>> chebyshev('cat', 'hat')
    1.0
    >>> chebyshev('Niall', 'Neil')
    1.0
    >>> chebyshev('Colin', 'Cuilen')
    1.0
    >>> chebyshev('ATCG', 'TAGC')
    1.0
    >>> chebyshev('ATCG', 'TAGC', qval=1)
    0.0
    >>> chebyshev('ATCGATTCGGAATTTC', 'TAGCATAATCGCCG', qval=1)
    3.0
    """
    return minkowski(src, tar, qval, float('inf'), normalized, alphabet)


def sim_cosine(src, tar, qval=2):
    r"""Return the cosine similarity of two strings.

    For two sets X and Y, the cosine similarity, Otsuka-Ochiai coefficient, or
    Ochiai coefficient :cite:`Otsuka:1936,Ochiai:1957` is:
    :math:`sim_{cosine}(X, Y) = \\frac{|X \\cap Y|}{\\sqrt{|X| \\cdot |Y|}}`.

    :param str src: source string (or QGrams/Counter objects) for comparison
    :param str tar: target string (or QGrams/Counter objects) for comparison
    :param int qval: the length of each q-gram; 0 for non-q-gram version
    :returns: cosine similarity
    :rtype: float

    >>> sim_cosine('cat', 'hat')
    0.5
    >>> sim_cosine('Niall', 'Neil')
    0.3651483716701107
    >>> sim_cosine('aluminum', 'Catalan')
    0.11785113019775793
    >>> sim_cosine('ATCG', 'TAGC')
    0.0
    """
    if src == tar:
        return 1.0
    if not src or not tar:
        return 0.0

    q_src, q_tar = _get_qgrams(src, tar, qval)
    q_src_mag = sum(q_src.values())
    q_tar_mag = sum(q_tar.values())
    q_intersection_mag = sum((q_src & q_tar).values())

    return q_intersection_mag / sqrt(q_src_mag * q_tar_mag)


def dist_cosine(src, tar, qval=2):
    """Return the cosine distance between two strings.

    Cosine distance is the complement of cosine similarity:
    :math:`dist_{cosine} = 1 - sim_{cosine}`.

    :param str src: source string (or QGrams/Counter objects) for comparison
    :param str tar: target string (or QGrams/Counter objects) for comparison
    :param int qval: the length of each q-gram; 0 for non-q-gram version
    :returns: cosine distance
    :rtype: float

    >>> dist_cosine('cat', 'hat')
    0.5
    >>> dist_cosine('Niall', 'Neil')
    0.6348516283298893
    >>> dist_cosine('aluminum', 'Catalan')
    0.882148869802242
    >>> dist_cosine('ATCG', 'TAGC')
    1.0
    """
    return 1 - sim_cosine(src, tar, qval)


def sim_strcmp95(src, tar, long_strings=False):
    """Return the strcmp95 similarity of two strings.

    This is a Python translation of the C code for strcmp95:
    http://web.archive.org/web/20110629121242/http://www.census.gov/geo/msb/stand/strcmp.c
    :cite:`Winkler:1994`.
    The above file is a US Government publication and, accordingly,
    in the public domain.

    This is based on the Jaro-Winkler distance, but also attempts to correct
    for some common typos and frequently confused characters. It is also
    limited to uppercase ASCII characters, so it is appropriate to American
    names, but not much else.

    :param str src: source string for comparison
    :param str tar: target string for comparison
    :param bool long_strings: set to True to "Increase the probability of a
        match when the number of matched characters is large.  This option
        allows for a little more tolerance when the strings are large. It is
        not an appropriate test when comparing fixed length fields such as
        phone and social security numbers."
    :returns: strcmp95 similarity
    :rtype: float

    >>> sim_strcmp95('cat', 'hat')
    0.7777777777777777
    >>> sim_strcmp95('Niall', 'Neil')
    0.8454999999999999
    >>> sim_strcmp95('aluminum', 'Catalan')
    0.6547619047619048
    >>> sim_strcmp95('ATCG', 'TAGC')
    0.8333333333333334
    """
    def _in_range(char):
        """Return True if char is in the range (0, 91)."""
        return 91 > ord(char) > 0

    ying = src.strip().upper()
    yang = tar.strip().upper()

    if ying == yang:
        return 1.0
    # If either string is blank - return - added in Version 2
    if not ying or not yang:
        return 0.0

    adjwt = defaultdict(int)
    sp_mx = (
        ('A', 'E'), ('A', 'I'), ('A', 'O'), ('A', 'U'), ('B', 'V'), ('E', 'I'),
        ('E', 'O'), ('E', 'U'), ('I', 'O'), ('I', 'U'), ('O', 'U'), ('I', 'Y'),
        ('E', 'Y'), ('C', 'G'), ('E', 'F'), ('W', 'U'), ('W', 'V'), ('X', 'K'),
        ('S', 'Z'), ('X', 'S'), ('Q', 'C'), ('U', 'V'), ('M', 'N'), ('L', 'I'),
        ('Q', 'O'), ('P', 'R'), ('I', 'J'), ('2', 'Z'), ('5', 'S'), ('8', 'B'),
        ('1', 'I'), ('1', 'L'), ('0', 'O'), ('0', 'Q'), ('C', 'K'), ('G', 'J')
    )

    # Initialize the adjwt array on the first call to the function only.
    # The adjwt array is used to give partial credit for characters that
    # may be errors due to known phonetic or character recognition errors.
    # A typical example is to match the letter "O" with the number "0"
    for i in sp_mx:
        adjwt[(i[0], i[1])] = 3
        adjwt[(i[1], i[0])] = 3

    if len(ying) > len(yang):
        search_range = len(ying)
        minv = len(yang)
    else:
        search_range = len(yang)
        minv = len(ying)

    # Blank out the flags
    ying_flag = [0] * search_range
    yang_flag = [0] * search_range
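
    # Jaro match window: characters count as matched only if they occur
    # within half the longer string's length (minus one) of each other.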
    search_range = max(0, search_range // 2 - 1)
1354
1355
    # Looking only within the search range, count and flag the matched pairs.
1356
    num_com = 0
1357
    yl1 = len(yang) - 1
1358
    for i in range(len(ying)):
1359
        low_lim = (i - search_range) if (i >= search_range) else 0
1360
        hi_lim = (i + search_range) if ((i + search_range) <= yl1) else yl1
1361
        for j in range(low_lim, hi_lim+1):
1362
            if (yang_flag[j] == 0) and (yang[j] == ying[i]):
1363
                yang_flag[j] = 1
1364
                ying_flag[i] = 1
1365
                num_com += 1
1366
                break
1367
1368
    # If no characters in common - return
1369
    if num_com == 0:
1370
        return 0.0
1371
1372
    # Count the number of transpositions
1373
    k = n_trans = 0
1374
    for i in range(len(ying)):
1375
        if ying_flag[i] != 0:
1376
            j = 0
1377
            for j in range(k, len(yang)):  # pragma: no branch
1378
                if yang_flag[j] != 0:
1379
                    k = j + 1
1380
                    break
1381
            if ying[i] != yang[j]:
1382
                n_trans += 1
1383
    n_trans //= 2
1384
1385
    # Adjust for similarities in unmatched characters
1386
    n_simi = 0
1387
    if minv > num_com:
1388
        for i in range(len(ying)):
1389
            if ying_flag[i] == 0 and _in_range(ying[i]):
1390
                for j in range(len(yang)):
1391
                    if yang_flag[j] == 0 and _in_range(yang[j]):
1392
                        if (ying[i], yang[j]) in adjwt:
1393
                            n_simi += adjwt[(ying[i], yang[j])]
1394
                            yang_flag[j] = 2
1395
                            break
1396
    num_sim = n_simi/10.0 + num_com
1397
1398
    # Main weight computation
1399
    weight = num_sim / len(ying) + num_sim / len(yang) + \
1400
        (num_com - n_trans) / num_com
1401
    weight /= 3.0

    # Continue to boost the weight if the strings are similar
    if weight > 0.7:

        # Adjust for having up to the first 4 characters in common
        j = 4 if (minv >= 4) else minv
        i = 0
        while (i < j) and (ying[i] == yang[i]) and (not ying[i].isdigit()):
            i += 1
        if i:
            weight += i * 0.1 * (1.0 - weight)

        # Optionally adjust for long strings.

        # After agreeing beginning chars, at least two more must agree and
        # the agreeing characters must be > .5 of remaining characters.
        if (long_strings and (minv > 4) and (num_com > i+1) and
                (2*num_com >= minv+i)):
            if not ying[0].isdigit():
                weight += (1.0-weight) * ((num_com-i-1) /
                                          (len(ying)+len(yang)-i*2+2))

    return weight


def dist_strcmp95(src, tar, long_strings=False):
    """Return the strcmp95 distance between two strings.

    strcmp95 distance is the complement of strcmp95 similarity:
    :math:`dist_{strcmp95} = 1 - sim_{strcmp95}`.

    :param str src: source string for comparison
    :param str tar: target string for comparison
    :param bool long_strings: set to True to "Increase the probability of a
        match when the number of matched characters is large.  This option
        allows for a little more tolerance when the strings are large. It is
        not an appropriate test when comparing fixed length fields such as
        phone and social security numbers."
    :returns: strcmp95 distance
    :rtype: float

    >>> round(dist_strcmp95('cat', 'hat'), 12)
    0.222222222222
    >>> round(dist_strcmp95('Niall', 'Neil'), 12)
    0.1545
    >>> round(dist_strcmp95('aluminum', 'Catalan'), 12)
    0.345238095238
    >>> round(dist_strcmp95('ATCG', 'TAGC'), 12)
    0.166666666667
    """
    return 1 - sim_strcmp95(src, tar, long_strings)


def sim_jaro_winkler(src, tar, qval=1, mode='winkler', long_strings=False,
                     boost_threshold=0.7, scaling_factor=0.1):
    """Return the Jaro or Jaro-Winkler similarity of two strings.

    Jaro(-Winkler) distance is a string edit distance initially proposed by
    Jaro and extended by Winkler :cite:`Jaro:1989,Winkler:1990`.

    This Python version is based on the C code for strcmp95:
    http://web.archive.org/web/20110629121242/http://www.census.gov/geo/msb/stand/strcmp.c
    :cite:`Winkler:1994`. The above file is a US Government publication and,
    accordingly, in the public domain.

    :param str src: source string for comparison
    :param str tar: target string for comparison
    :param int qval: the length of each q-gram (defaults to 1: character-wise
        matching)
    :param str mode: indicates which variant of this distance metric to
        compute:

            - 'winkler' -- computes the Jaro-Winkler distance (default) which
              increases the score for matches near the start of the word
            - 'jaro' -- computes the Jaro distance

    The following arguments apply only when mode is 'winkler':

    :param bool long_strings: set to True to "Increase the probability of a
        match when the number of matched characters is large.  This option
        allows for a little more tolerance when the strings are large.  It is
        not an appropriate test when comparing fixed length fields such as
        phone and social security numbers."
    :param float boost_threshold: a value between 0 and 1, below which the
        Winkler boost is not applied (defaults to 0.7)
    :param float scaling_factor: a value between 0 and 0.25, indicating by how
        much to boost scores for matching prefixes (defaults to 0.1)

    :returns: Jaro or Jaro-Winkler similarity
    :rtype: float

    >>> round(sim_jaro_winkler('cat', 'hat'), 12)
    0.777777777778
    >>> round(sim_jaro_winkler('Niall', 'Neil'), 12)
    0.805
    >>> round(sim_jaro_winkler('aluminum', 'Catalan'), 12)
    0.60119047619
    >>> round(sim_jaro_winkler('ATCG', 'TAGC'), 12)
    0.833333333333

    >>> round(sim_jaro_winkler('cat', 'hat', mode='jaro'), 12)
    0.777777777778
    >>> round(sim_jaro_winkler('Niall', 'Neil', mode='jaro'), 12)
    0.783333333333
    >>> round(sim_jaro_winkler('aluminum', 'Catalan', mode='jaro'), 12)
    0.60119047619
    >>> round(sim_jaro_winkler('ATCG', 'TAGC', mode='jaro'), 12)
    0.833333333333
    """
    if mode == 'winkler':
        if boost_threshold > 1 or boost_threshold < 0:
            raise ValueError('Unsupported boost_threshold assignment; ' +
                             'boost_threshold must be between 0 and 1.')
        if scaling_factor > 0.25 or scaling_factor < 0:
            raise ValueError('Unsupported scaling_factor assignment; ' +
                             'scaling_factor must be between 0 and 0.25.')

    if src == tar:
        return 1.0

    src = QGrams(src.strip(), qval).ordered_list
    tar = QGrams(tar.strip(), qval).ordered_list

    lens = len(src)
    lent = len(tar)

    # If either string is blank - return - added in Version 2
    if lens == 0 or lent == 0:
        return 0.0

    if lens > lent:
        search_range = lens
        minv = lent
    else:
        search_range = lent
        minv = lens

    # Zero out the flags
    src_flag = [0] * search_range
    tar_flag = [0] * search_range
    search_range = max(0, search_range//2 - 1)

    # Looking only within the search range, count and flag the matched pairs.
    num_com = 0
    yl1 = lent - 1
    for i in range(lens):
        low_lim = (i - search_range) if (i >= search_range) else 0
        hi_lim = (i + search_range) if ((i + search_range) <= yl1) else yl1
        for j in range(low_lim, hi_lim+1):
            if (tar_flag[j] == 0) and (tar[j] == src[i]):
                tar_flag[j] = 1
                src_flag[i] = 1
                num_com += 1
                break

    # If no characters in common - return
    if num_com == 0:
        return 0.0

    # Count the number of transpositions
    k = n_trans = 0
    for i in range(lens):
        if src_flag[i] != 0:
            j = 0
            for j in range(k, lent):  # pragma: no branch
                if tar_flag[j] != 0:
                    k = j + 1
                    break
            if src[i] != tar[j]:
                n_trans += 1
    n_trans //= 2

    # Main weight computation for Jaro distance
    weight = num_com / lens + num_com / lent + (num_com - n_trans) / num_com
    weight /= 3.0

    # Continue to boost the weight if the strings are similar
    # This is the Winkler portion of Jaro-Winkler distance
    if mode == 'winkler' and weight > boost_threshold:

        # Adjust for having up to the first 4 characters in common
        j = 4 if (minv >= 4) else minv
        i = 0
        while (i < j) and (src[i] == tar[i]):
            i += 1
        weight += i * scaling_factor * (1.0 - weight)
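        # Winkler's prefix boost: sim_w = sim_j + l * p * (1 - sim_j), where
        # l is the length of the common prefix (capped at 4) and p is the
        # scaling_factor.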

        # Optionally adjust for long strings.

        # After agreeing beginning chars, at least two more must agree and
        # the agreeing characters must be > .5 of remaining characters.
        if (long_strings and (minv > 4) and (num_com > i+1) and
                (2*num_com >= minv+i)):
            weight += (1.0-weight) * ((num_com-i-1) / (lens+lent-i*2+2))

    return weight


def dist_jaro_winkler(src, tar, qval=1, mode='winkler', long_strings=False,
                      boost_threshold=0.7, scaling_factor=0.1):
    """Return the Jaro or Jaro-Winkler distance between two strings.

    Jaro(-Winkler) distance is the complement of Jaro(-Winkler) similarity:
    :math:`dist_{Jaro(-Winkler)} = 1 - sim_{Jaro(-Winkler)}`.

    :param str src: source string for comparison
    :param str tar: target string for comparison
    :param int qval: the length of each q-gram (defaults to 1: character-wise
        matching)
    :param str mode: indicates which variant of this distance metric to
        compute:

            - 'winkler' -- computes the Jaro-Winkler distance (default) which
              increases the score for matches near the start of the word
            - 'jaro' -- computes the Jaro distance

    The following arguments apply only when mode is 'winkler':

    :param bool long_strings: set to True to "Increase the probability of a
        match when the number of matched characters is large.  This option
        allows for a little more tolerance when the strings are large.  It is
        not an appropriate test when comparing fixed length fields such as
        phone and social security numbers."
    :param float boost_threshold: a value between 0 and 1, below which the
        Winkler boost is not applied (defaults to 0.7)
    :param float scaling_factor: a value between 0 and 0.25, indicating by how
        much to boost scores for matching prefixes (defaults to 0.1)

    :returns: Jaro or Jaro-Winkler distance
    :rtype: float

    >>> round(dist_jaro_winkler('cat', 'hat'), 12)
    0.222222222222
    >>> round(dist_jaro_winkler('Niall', 'Neil'), 12)
    0.195
    >>> round(dist_jaro_winkler('aluminum', 'Catalan'), 12)
    0.39880952381
    >>> round(dist_jaro_winkler('ATCG', 'TAGC'), 12)
    0.166666666667

    >>> round(dist_jaro_winkler('cat', 'hat', mode='jaro'), 12)
    0.222222222222
    >>> round(dist_jaro_winkler('Niall', 'Neil', mode='jaro'), 12)
    0.216666666667
    >>> round(dist_jaro_winkler('aluminum', 'Catalan', mode='jaro'), 12)
    0.39880952381
    >>> round(dist_jaro_winkler('ATCG', 'TAGC', mode='jaro'), 12)
    0.166666666667
    """
    return 1 - sim_jaro_winkler(src, tar, qval, mode, long_strings,
                                boost_threshold, scaling_factor)


def lcsseq(src, tar):
    """Return the longest common subsequence of two strings.

    Longest common subsequence (LCSseq) is the longest subsequence of
    characters that two strings have in common.

    Based on the dynamic programming algorithm from
    http://rosettacode.org/wiki/Longest_common_subsequence#Dynamic_Programming_6
    :cite:`rosettacode:2018b`. This is licensed GFDL 1.2.

    Modifications include:
        conversion to a numpy array in place of a list of lists

    :param str src: source string for comparison
    :param str tar: target string for comparison
    :returns: the longest common subsequence
    :rtype: str

    >>> lcsseq('cat', 'hat')
    'at'
    >>> lcsseq('Niall', 'Neil')
    'Nil'
    >>> lcsseq('aluminum', 'Catalan')
    'aln'
    >>> lcsseq('ATCG', 'TAGC')
    'AC'
    """
    lengths = np_zeros((len(src)+1, len(tar)+1), dtype=np_int)
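    # DP recurrence: extend the diagonal on a character match; otherwise
    # carry forward the larger of the cell above and the cell to the left.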

    # row 0 and column 0 are initialized to 0 already
    for i, src_char in enumerate(src):
        for j, tar_char in enumerate(tar):
            if src_char == tar_char:
                lengths[i+1, j+1] = lengths[i, j] + 1
            else:
                lengths[i+1, j+1] = max(lengths[i+1, j], lengths[i, j+1])

    # read the subsequence out from the matrix
    result = ''
    i, j = len(src), len(tar)
    while i != 0 and j != 0:
        if lengths[i, j] == lengths[i-1, j]:
            i -= 1
        elif lengths[i, j] == lengths[i, j-1]:
            j -= 1
        else:
            result = src[i-1] + result
            i -= 1
            j -= 1
    return result


def sim_lcsseq(src, tar):
    r"""Return the longest common subsequence similarity of two strings.

    Longest common subsequence similarity (:math:`sim_{LCSseq}`).

    This employs the LCSseq function to derive a similarity metric:
    :math:`sim_{LCSseq}(s,t) = \\frac{|LCSseq(s,t)|}{max(|s|, |t|)}`

    :param str src: source string for comparison
    :param str tar: target string for comparison
    :returns: LCSseq similarity
    :rtype: float

    >>> sim_lcsseq('cat', 'hat')
    0.6666666666666666
    >>> sim_lcsseq('Niall', 'Neil')
    0.6
    >>> sim_lcsseq('aluminum', 'Catalan')
    0.375
    >>> sim_lcsseq('ATCG', 'TAGC')
    0.5
    """
    if src == tar:
        return 1.0
    elif not src or not tar:
        return 0.0
    return len(lcsseq(src, tar)) / max(len(src), len(tar))


def dist_lcsseq(src, tar):
    """Return the longest common subsequence distance between two strings.

    Longest common subsequence distance (:math:`dist_{LCSseq}`).

    This employs the LCSseq function to derive a distance metric:
    :math:`dist_{LCSseq}(s,t) = 1 - sim_{LCSseq}(s,t)`

    :param str src: source string for comparison
    :param str tar: target string for comparison
    :returns: LCSseq distance
    :rtype: float

    >>> dist_lcsseq('cat', 'hat')
    0.33333333333333337
    >>> dist_lcsseq('Niall', 'Neil')
    0.4
    >>> dist_lcsseq('aluminum', 'Catalan')
    0.625
    >>> dist_lcsseq('ATCG', 'TAGC')
    0.5
    """
    return 1 - sim_lcsseq(src, tar)


def lcsstr(src, tar):
    """Return the longest common substring of two strings.

    Longest common substring (LCSstr).

    Based on the code from
    https://en.wikibooks.org/wiki/Algorithm_Implementation/Strings/Longest_common_substring#Python
    :cite:`Wikibooks:2018`.
    This is licensed Creative Commons: Attribution-ShareAlike 3.0.

    Modifications include:

        - conversion to a numpy array in place of a list of lists
        - conversion to Python 2/3-safe range from xrange via six

    :param str src: source string for comparison
    :param str tar: target string for comparison
    :returns: the longest common substring
    :rtype: str

    >>> lcsstr('cat', 'hat')
    'at'
    >>> lcsstr('Niall', 'Neil')
    'N'
    >>> lcsstr('aluminum', 'Catalan')
    'al'
    >>> lcsstr('ATCG', 'TAGC')
    'A'
    """
    lengths = np_zeros((len(src)+1, len(tar)+1), dtype=np_int)
    longest, i_longest = 0, 0
    for i in range(1, len(src)+1):
        for j in range(1, len(tar)+1):
            if src[i-1] == tar[j-1]:
                lengths[i, j] = lengths[i-1, j-1] + 1
                if lengths[i, j] > longest:
                    longest = lengths[i, j]
                    i_longest = i
            else:
                lengths[i, j] = 0
    return src[i_longest - longest:i_longest]


def sim_lcsstr(src, tar):
    r"""Return the longest common substring similarity of two strings.

    Longest common substring similarity (:math:`sim_{LCSstr}`).

    This employs the LCSstr function to derive a similarity metric:
    :math:`sim_{LCSstr}(s,t) = \\frac{|LCSstr(s,t)|}{max(|s|, |t|)}`

    :param str src: source string for comparison
    :param str tar: target string for comparison
    :returns: LCSstr similarity
    :rtype: float

    >>> sim_lcsstr('cat', 'hat')
    0.6666666666666666
    >>> sim_lcsstr('Niall', 'Neil')
    0.2
    >>> sim_lcsstr('aluminum', 'Catalan')
    0.25
    >>> sim_lcsstr('ATCG', 'TAGC')
    0.25
    """
    if src == tar:
        return 1.0
    elif not src or not tar:
        return 0.0
    return len(lcsstr(src, tar)) / max(len(src), len(tar))


def dist_lcsstr(src, tar):
    """Return the longest common substring distance between two strings.

    Longest common substring distance (:math:`dist_{LCSstr}`).

    This employs the LCSstr function to derive a distance metric:
    :math:`dist_{LCSstr}(s,t) = 1 - sim_{LCSstr}(s,t)`

    :param str src: source string for comparison
    :param str tar: target string for comparison
    :returns: LCSstr distance
    :rtype: float

    >>> dist_lcsstr('cat', 'hat')
    0.33333333333333337
    >>> dist_lcsstr('Niall', 'Neil')
    0.8
    >>> dist_lcsstr('aluminum', 'Catalan')
    0.75
    >>> dist_lcsstr('ATCG', 'TAGC')
    0.75
    """
    return 1 - sim_lcsstr(src, tar)


def sim_ratcliff_obershelp(src, tar):
    """Return the Ratcliff-Obershelp similarity of two strings.

    This follows the Ratcliff-Obershelp algorithm :cite:`Ratcliff:1988` to
    derive a similarity measure:

        1. Find the length of the longest common substring in src & tar.
        2. Recurse on the strings to the left & right of this substring
           in src & tar. The base case is a 0 length common substring, in which
           case, return 0. Otherwise, return the sum of the current longest
           common substring and the left & right recursed sums.
        3. Multiply this length by 2 and divide by the sum of the lengths of
           src & tar.

    Cf.
    http://www.drdobbs.com/database/pattern-matching-the-gestalt-approach/184407970

    :param str src: source string for comparison
    :param str tar: target string for comparison
    :returns: Ratcliff-Obershelp similarity
    :rtype: float

    >>> round(sim_ratcliff_obershelp('cat', 'hat'), 12)
    0.666666666667
    >>> round(sim_ratcliff_obershelp('Niall', 'Neil'), 12)
    0.666666666667
    >>> round(sim_ratcliff_obershelp('aluminum', 'Catalan'), 12)
    0.4
    >>> sim_ratcliff_obershelp('ATCG', 'TAGC')
    0.5
    """
    def _lcsstr_stl(src, tar):
        """Return start positions & length for Ratcliff-Obershelp.

        Return the start position in the source string, start position in
        the target string, and length of the longest common substring of
        strings src and tar.
        """
        lengths = np_zeros((len(src)+1, len(tar)+1), dtype=np_int)
        longest, src_longest, tar_longest = 0, 0, 0
        for i in range(1, len(src)+1):
            for j in range(1, len(tar)+1):
                if src[i-1] == tar[j-1]:
                    lengths[i, j] = lengths[i-1, j-1] + 1
                    if lengths[i, j] > longest:
                        longest = lengths[i, j]
                        src_longest = i
                        tar_longest = j
                else:
                    lengths[i, j] = 0
        return src_longest-longest, tar_longest-longest, longest

    def _sstr_matches(src, tar):
        """Return the sum of substring match lengths.

        This follows the Ratcliff-Obershelp algorithm :cite:`Ratcliff:1988`:
             1. Find the length of the longest common substring in src & tar.
             2. Recurse on the strings to the left & right of this
                 substring in src & tar.
             3. Base case is a 0 length common substring, in which case,
                 return 0.
             4. Return the sum.
        """
        src_start, tar_start, length = _lcsstr_stl(src, tar)
        if length == 0:
            return 0
        return (_sstr_matches(src[:src_start], tar[:tar_start]) +
                length +
                _sstr_matches(src[src_start+length:],
                              tar[tar_start+length:]))

    if src == tar:
        return 1.0
    elif not src or not tar:
        return 0.0
    return 2*_sstr_matches(src, tar)/(len(src)+len(tar))


def dist_ratcliff_obershelp(src, tar):
    """Return the Ratcliff-Obershelp distance between two strings.

    Ratcliff-Obershelp distance is the complement of Ratcliff-Obershelp
    similarity:
    :math:`dist_{Ratcliff-Obershelp} = 1 - sim_{Ratcliff-Obershelp}`.

    :param str src: source string for comparison
    :param str tar: target string for comparison
    :returns: Ratcliff-Obershelp distance
    :rtype: float

    >>> round(dist_ratcliff_obershelp('cat', 'hat'), 12)
    0.333333333333
    >>> round(dist_ratcliff_obershelp('Niall', 'Neil'), 12)
    0.333333333333
    >>> round(dist_ratcliff_obershelp('aluminum', 'Catalan'), 12)
    0.6
    >>> dist_ratcliff_obershelp('ATCG', 'TAGC')
    0.5
    """
    return 1 - sim_ratcliff_obershelp(src, tar)


def mra_compare(src, tar):
    """Return the MRA comparison rating of two strings.

    The Western Airlines Surname Match Rating Algorithm comparison rating, as
    presented on page 18 of :cite:`Moore:1977`.

    :param str src: source string for comparison
    :param str tar: target string for comparison
    :returns: MRA comparison rating
    :rtype: int

    >>> mra_compare('cat', 'hat')
    5
    >>> mra_compare('Niall', 'Neil')
    6
    >>> mra_compare('aluminum', 'Catalan')
    0
    >>> mra_compare('ATCG', 'TAGC')
    5
    """
    if src == tar:
        return 6
    if src == '' or tar == '':
        return 0
    src = list(mra(src))
    tar = list(mra(tar))

    if abs(len(src)-len(tar)) > 2:
        return 0

    length_sum = len(src) + len(tar)
    if length_sum < 5:
        min_rating = 5
    elif length_sum < 8:
        min_rating = 4
    elif length_sum < 12:
        min_rating = 3
    else:
        min_rating = 2
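
    # Two passes (forward, then over the reversed residues) drop positions
    # where the codes agree; the rating is 6 minus the longer residual
    # length, returned only if it meets min_rating.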
    for _ in range(2):
        new_src = []
        new_tar = []
        minlen = min(len(src), len(tar))
        for i in range(minlen):
            if src[i] != tar[i]:
                new_src.append(src[i])
                new_tar.append(tar[i])
        src = new_src+src[minlen:]
        tar = new_tar+tar[minlen:]
        src.reverse()
        tar.reverse()

    similarity = 6 - max(len(src), len(tar))

    if similarity >= min_rating:
        return similarity
    return 0


def sim_mra(src, tar):
    """Return the normalized MRA similarity of two strings.

    This is the MRA normalized to :math:`[0, 1]`, given that MRA itself is
    constrained to the range :math:`[0, 6]`.

    :param str src: source string for comparison
    :param str tar: target string for comparison
    :returns: normalized MRA similarity
    :rtype: float

    >>> sim_mra('cat', 'hat')
    0.8333333333333334
    >>> sim_mra('Niall', 'Neil')
    1.0
    >>> sim_mra('aluminum', 'Catalan')
    0.0
    >>> sim_mra('ATCG', 'TAGC')
    0.8333333333333334
    """
    return mra_compare(src, tar)/6


def dist_mra(src, tar):
    """Return the normalized MRA distance between two strings.

    MRA distance is the complement of MRA similarity:
    :math:`dist_{MRA} = 1 - sim_{MRA}`.

    :param str src: source string for comparison
    :param str tar: target string for comparison
    :returns: normalized MRA distance
    :rtype: float

    >>> dist_mra('cat', 'hat')
    0.16666666666666663
    >>> dist_mra('Niall', 'Neil')
    0.0
    >>> dist_mra('aluminum', 'Catalan')
    1.0
    >>> dist_mra('ATCG', 'TAGC')
    0.16666666666666663
    """
    return 1 - sim_mra(src, tar)


def dist_compression(src, tar, compressor='bz2', probs=None):
    """Return the normalized compression distance between two strings.

    Normalized compression distance (NCD) :cite:`Cilibrasi:2005`.

    :param str src: source string for comparison
    :param str tar: target string for comparison
    :param str compressor: a compression scheme to use for the similarity
        calculation, from the following:

            - `zlib` -- standard zlib/gzip
            - `bz2` -- bzip2 (default)
            - `lzma` -- Lempel–Ziv–Markov chain algorithm
            - `arith` -- arithmetic coding
            - `rle` -- run-length encoding
            - `bwtrle` -- Burrows-Wheeler transform followed by run-length
              encoding

    :param dict probs: a dictionary trained with ac_train (for the arith
        compressor only)
    :returns: compression distance
    :rtype: float

    >>> dist_compression('cat', 'hat')
    0.08
    >>> dist_compression('Niall', 'Neil')
    0.037037037037037035
    >>> dist_compression('aluminum', 'Catalan')
    0.20689655172413793
    >>> dist_compression('ATCG', 'TAGC')
    0.037037037037037035

    >>> dist_compression('Niall', 'Neil', compressor='zlib')
    0.45454545454545453
    >>> dist_compression('Niall', 'Neil', compressor='bz2')
    0.037037037037037035
    >>> dist_compression('Niall', 'Neil', compressor='lzma')
    0.16
    >>> dist_compression('Niall', 'Neil', compressor='arith')
    0.6875
    >>> dist_compression('Niall', 'Neil', compressor='rle')
    1.0
    >>> dist_compression('Niall', 'Neil', compressor='bwtrle')
    0.8333333333333334
    """
    if src == tar:
        return 0.0

    if compressor not in {'arith', 'rle', 'bwtrle'}:
        src = src.encode('utf-8')
        tar = tar.encode('utf-8')

    if compressor == 'bz2':
        src_comp = encode(src, 'bz2_codec')[15:]
        tar_comp = encode(tar, 'bz2_codec')[15:]
        concat_comp = encode(src+tar, 'bz2_codec')[15:]
        concat_comp2 = encode(tar+src, 'bz2_codec')[15:]
    elif compressor == 'lzma':
        if 'lzma' in modules:
            src_comp = lzma.compress(src)[14:]
            tar_comp = lzma.compress(tar)[14:]
            concat_comp = lzma.compress(src+tar)[14:]
            concat_comp2 = lzma.compress(tar+src)[14:]
        else:
            raise ValueError('Install the PylibLZMA module in order to use ' +
                             'lzma compression similarity')
    elif compressor == 'arith':
        if probs is None:
            # lacking a reasonable dictionary, train on the strings themselves
            probs = ac_train(src+tar)
        src_comp = ac_encode(src, probs)[1]
        tar_comp = ac_encode(tar, probs)[1]
        concat_comp = ac_encode(src+tar, probs)[1]
        concat_comp2 = ac_encode(tar+src, probs)[1]
        return ((min(concat_comp, concat_comp2) - min(src_comp, tar_comp)) /
                max(src_comp, tar_comp))
    elif compressor in {'rle', 'bwtrle'}:
        src_comp = rle_encode(src, (compressor == 'bwtrle'))
        tar_comp = rle_encode(tar, (compressor == 'bwtrle'))
        concat_comp = rle_encode(src+tar, (compressor == 'bwtrle'))
        concat_comp2 = rle_encode(tar+src, (compressor == 'bwtrle'))
    else:  # zlib
        src_comp = encode(src, 'zlib_codec')[2:]
        tar_comp = encode(tar, 'zlib_codec')[2:]
        concat_comp = encode(src+tar, 'zlib_codec')[2:]
        concat_comp2 = encode(tar+src, 'zlib_codec')[2:]
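    # NCD(x, y) = (C(xy) - min(C(x), C(y))) / max(C(x), C(y)), here taking
    # the smaller of the two concatenation orders for C(xy).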
    return ((min(len(concat_comp), len(concat_comp2)) -
             min(len(src_comp), len(tar_comp))) /
            max(len(src_comp), len(tar_comp)))


def sim_compression(src, tar, compressor='bz2', probs=None):
    """Return the normalized compression similarity of two strings.

    Normalized compression similarity is the complement of normalized
    compression distance:
    :math:`sim_{NCS} = 1 - dist_{NCD}`.

    :param str src: source string for comparison
    :param str tar: target string for comparison
    :param str compressor: a compression scheme to use for the similarity
        calculation:

            - `zlib` -- standard zlib/gzip
            - `bz2` -- bzip2 (default)
            - `lzma` -- Lempel–Ziv–Markov chain algorithm
            - `arith` -- arithmetic coding
            - `rle` -- run-length encoding
            - `bwtrle` -- Burrows-Wheeler transform followed by run-length
              encoding

    :param dict probs: a dictionary trained with ac_train (for the arith
        compressor only)
    :returns: compression similarity
    :rtype: float

    >>> sim_compression('cat', 'hat')
    0.92
    >>> sim_compression('Niall', 'Neil')
    0.962962962962963
    >>> sim_compression('aluminum', 'Catalan')
    0.7931034482758621
    >>> sim_compression('ATCG', 'TAGC')
    0.962962962962963

    >>> sim_compression('Niall', 'Neil', compressor='zlib')
    0.5454545454545454
    >>> sim_compression('Niall', 'Neil', compressor='bz2')
    0.962962962962963
    >>> sim_compression('Niall', 'Neil', compressor='lzma')
    0.84
    >>> sim_compression('Niall', 'Neil', compressor='arith')
    0.3125
    >>> sim_compression('Niall', 'Neil', compressor='rle')
    0.0
    >>> sim_compression('Niall', 'Neil', compressor='bwtrle')
    0.16666666666666663
    """
    return 1 - dist_compression(src, tar, compressor, probs)


def sim_monge_elkan(src, tar, sim_func=sim_levenshtein, symmetric=False):
    """Return the Monge-Elkan similarity of two strings.

    Monge-Elkan is defined in :cite:`Monge:1996`.

    Note: Monge-Elkan is NOT a symmetric similarity algorithm. Thus, the
    similarity of src to tar is not necessarily equal to the similarity of
    tar to src. If the symmetric argument is True, a symmetric value is
    calculated, at the cost of doubling the computation time (since
    :math:`sim_{Monge-Elkan}(src, tar)` and
    :math:`sim_{Monge-Elkan}(tar, src)` are both calculated and then averaged).

    :param str src: source string for comparison
    :param str tar: target string for comparison
    :param function sim_func: the internal similarity metric to employ
    :param bool symmetric: return a symmetric similarity measure
    :returns: Monge-Elkan similarity
    :rtype: float

    >>> sim_monge_elkan('cat', 'hat')
    0.75
    >>> round(sim_monge_elkan('Niall', 'Neil'), 12)
    0.666666666667
    >>> round(sim_monge_elkan('aluminum', 'Catalan'), 12)
    0.388888888889
    >>> sim_monge_elkan('ATCG', 'TAGC')
    0.5
    """
    if src == tar:
        return 1.0

    q_src = sorted(QGrams(src).elements())
    q_tar = sorted(QGrams(tar).elements())

    if not q_src or not q_tar:
        return 0.0

    sum_of_maxes = 0
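    # sim_{Monge-Elkan}(src, tar) is the mean, over the q-grams of src, of
    # each q-gram's best sim_func score against any q-gram of tar.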
    for q_s in q_src:
        max_sim = float('-inf')
        for q_t in q_tar:
            max_sim = max(max_sim, sim_func(q_s, q_t))
        sum_of_maxes += max_sim
    sim_em = sum_of_maxes / len(q_src)

    if symmetric:
        sim_em = (sim_em + sim_monge_elkan(tar, src, sim_func, False))/2

    return sim_em


def dist_monge_elkan(src, tar, sim_func=sim_levenshtein, symmetric=False):
    """Return the Monge-Elkan distance between two strings.

    Monge-Elkan distance is the complement of Monge-Elkan similarity:
    :math:`dist_{Monge-Elkan} = 1 - sim_{Monge-Elkan}`.

    :param str src: source string for comparison
    :param str tar: target string for comparison
    :param function sim_func: the internal similarity metric to employ
    :param bool symmetric: return a symmetric similarity measure
    :returns: Monge-Elkan distance
    :rtype: float

    >>> dist_monge_elkan('cat', 'hat')
    0.25
    >>> round(dist_monge_elkan('Niall', 'Neil'), 12)
    0.333333333333
    >>> round(dist_monge_elkan('aluminum', 'Catalan'), 12)
    0.611111111111
    >>> dist_monge_elkan('ATCG', 'TAGC')
    0.5
    """
    return 1 - sim_monge_elkan(src, tar, sim_func, symmetric)


def sim_ident(src, tar):
    """Return the identity similarity of two strings.

    Identity similarity is 1 if the two strings are identical, otherwise 0.

    :param str src: source string for comparison
    :param str tar: target string for comparison
    :returns: identity similarity
    :rtype: int

    >>> sim_ident('cat', 'hat')
    0
    >>> sim_ident('cat', 'cat')
    1
    """
    return int(src == tar)


def dist_ident(src, tar):
    """Return the identity distance between two strings.

    This is 0 if the two strings are identical, otherwise 1, i.e.
    :math:`dist_{identity} = 1 - sim_{identity}`.

    :param str src: source string for comparison
    :param str tar: target string for comparison
    :returns: identity distance
    :rtype: int

    >>> dist_ident('cat', 'hat')
    1
    >>> dist_ident('cat', 'cat')
    0
    """
    return 1 - sim_ident(src, tar)


def sim_matrix(src, tar, mat=None, mismatch_cost=0, match_cost=1,
               symmetric=True, alphabet=None):
    """Return the matrix similarity of two strings.

    With the default parameters, this is identical to sim_ident.
    It is possible for sim_matrix to return values outside of the range
    :math:`[0, 1]`, if values outside that range are present in mat,
    mismatch_cost, or match_cost.

    :param str src: source string for comparison
    :param str tar: target string for comparison
    :param dict mat: a dict mapping tuples to costs; the tuples are (src, tar)
        pairs of symbols from the alphabet parameter
    :param float mismatch_cost: the value returned if (src, tar) is absent from
        mat when src does not equal tar
    :param float match_cost: the value returned if (src, tar) is absent from
        mat when src equals tar
    :param bool symmetric: True if the cost of src not matching tar is
        identical to the cost of tar not matching src; in this case, the values
        in mat need only contain (src, tar) or (tar, src), not both
    :param str alphabet: a collection of tokens from which src and tar are
        drawn; if this is defined a ValueError is raised if either tar or src
        is not found in alphabet
    :returns: matrix similarity
    :rtype: float

    >>> sim_matrix('cat', 'hat')
    0
    >>> sim_matrix('hat', 'hat')
    1
    """
    if alphabet:
        alphabet = tuple(alphabet)
        for i in src:
            if i not in alphabet:
                raise ValueError('src value not in alphabet')
        for i in tar:
            if i not in alphabet:
                raise ValueError('tar value not in alphabet')

    if src == tar:
        if mat and (src, src) in mat:
            return mat[(src, src)]
        return match_cost
    if mat and (src, tar) in mat:
        return mat[(src, tar)]
    elif symmetric and mat and (tar, src) in mat:
        return mat[(tar, src)]
    return mismatch_cost


def needleman_wunsch(src, tar, gap_cost=1, sim_func=sim_ident):
    """Return the Needleman-Wunsch score of two strings.

    The Needleman-Wunsch score :cite:`Needleman:1970` is a standard edit
    distance measure.

    :param str src: source string for comparison
    :param str tar: target string for comparison
    :param float gap_cost: the cost of an alignment gap (1 by default)
    :param function sim_func: a function that returns the similarity of two
        characters (identity similarity by default)
    :returns: Needleman-Wunsch score
    :rtype: float

    >>> needleman_wunsch('cat', 'hat')
    2.0
    >>> needleman_wunsch('Niall', 'Neil')
    1.0
    >>> needleman_wunsch('aluminum', 'Catalan')
    -1.0
    >>> needleman_wunsch('ATCG', 'TAGC')
    0.0
    """
    d_mat = np_zeros((len(src)+1, len(tar)+1), dtype=np_float32)
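    # Global alignment recurrence:
    #     D[i, j] = max(D[i-1, j-1] + sim_func(src[i-1], tar[j-1]),
    #                   D[i-1, j] - gap_cost, D[i, j-1] - gap_cost)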

    for i in range(len(src)+1):
        d_mat[i, 0] = -(i * gap_cost)
    for j in range(len(tar)+1):
        d_mat[0, j] = -(j * gap_cost)
    for i in range(1, len(src)+1):
        for j in range(1, len(tar)+1):
            match = d_mat[i-1, j-1] + sim_func(src[i-1], tar[j-1])
            delete = d_mat[i-1, j] - gap_cost
            insert = d_mat[i, j-1] - gap_cost
            d_mat[i, j] = max(match, delete, insert)
    return d_mat[d_mat.shape[0]-1, d_mat.shape[1]-1]


def smith_waterman(src, tar, gap_cost=1, sim_func=sim_ident):
    """Return the Smith-Waterman score of two strings.

    The Smith-Waterman score :cite:`Smith:1981` is a standard edit distance
    measure, differing from Needleman-Wunsch in that it focuses on local
    alignment and disallows negative scores.

    :param str src: source string for comparison
    :param str tar: target string for comparison
    :param float gap_cost: the cost of an alignment gap (1 by default)
    :param function sim_func: a function that returns the similarity of two
        characters (identity similarity by default)
    :returns: Smith-Waterman score
    :rtype: float

    >>> smith_waterman('cat', 'hat')
    2.0
    >>> smith_waterman('Niall', 'Neil')
    1.0
    >>> smith_waterman('aluminum', 'Catalan')
    0.0
    >>> smith_waterman('ATCG', 'TAGC')
    1.0
    """
    d_mat = np_zeros((len(src)+1, len(tar)+1), dtype=np_float32)
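    # Local alignment recurrence; the 0 floor lets an alignment restart:
    #     D[i, j] = max(0, D[i-1, j-1] + sim_func(src[i-1], tar[j-1]),
    #                   D[i-1, j] - gap_cost, D[i, j-1] - gap_cost)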

    for i in range(len(src)+1):
        d_mat[i, 0] = 0
    for j in range(len(tar)+1):
        d_mat[0, j] = 0
    for i in range(1, len(src)+1):
        for j in range(1, len(tar)+1):
            match = d_mat[i-1, j-1] + sim_func(src[i-1], tar[j-1])
            delete = d_mat[i-1, j] - gap_cost
            insert = d_mat[i, j-1] - gap_cost
            d_mat[i, j] = max(0, match, delete, insert)
    return d_mat[d_mat.shape[0]-1, d_mat.shape[1]-1]


def gotoh(src, tar, gap_open=1, gap_ext=0.4, sim_func=sim_ident):
    """Return the Gotoh score of two strings.

    The Gotoh score :cite:`Gotoh:1982` is essentially Needleman-Wunsch with
    affine gap penalties.

    :param str src: source string for comparison
    :param str tar: target string for comparison
    :param float gap_open: the cost of an open alignment gap (1 by default)
    :param float gap_ext: the cost of an alignment gap extension (0.4 by
        default)
    :param function sim_func: a function that returns the similarity of two
        characters (identity similarity by default)
    :returns: Gotoh score
    :rtype: float

    >>> gotoh('cat', 'hat')
    2.0
    >>> gotoh('Niall', 'Neil')
    1.0
    >>> round(gotoh('aluminum', 'Catalan'), 12)
    -0.4
    >>> gotoh('cat', 'hat')
    2.0
    """
    d_mat = np_zeros((len(src)+1, len(tar)+1), dtype=np_float32)
    p_mat = np_zeros((len(src)+1, len(tar)+1), dtype=np_float32)
    q_mat = np_zeros((len(src)+1, len(tar)+1), dtype=np_float32)
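    # Affine-gap bookkeeping: d_mat scores alignments ending in a
    # match/substitution, p_mat those ending in a gap in tar (gap_open to
    # start, gap_ext to extend), and q_mat those ending in a gap in src.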

    d_mat[0, 0] = 0
    p_mat[0, 0] = float('-inf')
    q_mat[0, 0] = float('-inf')
    for i in range(1, len(src)+1):
        d_mat[i, 0] = float('-inf')
        p_mat[i, 0] = -gap_open - gap_ext*(i-1)
        q_mat[i, 0] = float('-inf')
        q_mat[i, 1] = -gap_open
    for j in range(1, len(tar)+1):
        d_mat[0, j] = float('-inf')
        p_mat[0, j] = float('-inf')
        p_mat[1, j] = -gap_open
        q_mat[0, j] = -gap_open - gap_ext*(j-1)

    for i in range(1, len(src)+1):
        for j in range(1, len(tar)+1):
            sim_val = sim_func(src[i-1], tar[j-1])
            d_mat[i, j] = max(d_mat[i-1, j-1] + sim_val,
                              p_mat[i-1, j-1] + sim_val,
                              q_mat[i-1, j-1] + sim_val)

            p_mat[i, j] = max(d_mat[i-1, j] - gap_open,
                              p_mat[i-1, j] - gap_ext)

            q_mat[i, j] = max(d_mat[i, j-1] - gap_open,
                              q_mat[i, j-1] - gap_ext)

    i, j = (n - 1 for n in d_mat.shape)
    return max(d_mat[i, j], p_mat[i, j], q_mat[i, j])


def sim_length(src, tar):
    """Return the length similarity of two strings.

    Length similarity is the ratio of the length of the shorter string to the
    longer.

    :param str src: source string for comparison
    :param str tar: target string for comparison
    :returns: length similarity
    :rtype: float

    >>> sim_length('cat', 'hat')
    1.0
    >>> sim_length('Niall', 'Neil')
    0.8
    >>> sim_length('aluminum', 'Catalan')
    0.875
    >>> sim_length('ATCG', 'TAGC')
    1.0
    """
    if src == tar:
        return 1.0
    if not src or not tar:
        return 0.0
    return len(src)/len(tar) if len(src) < len(tar) else len(tar)/len(src)


def dist_length(src, tar):
    """Return the length distance between two strings.

    Length distance is the complement of length similarity:
    :math:`dist_{length} = 1 - sim_{length}`.

    :param str src: source string for comparison
    :param str tar: target string for comparison
    :returns: length distance
    :rtype: float

    >>> dist_length('cat', 'hat')
    0.0
    >>> dist_length('Niall', 'Neil')
    0.19999999999999996
    >>> dist_length('aluminum', 'Catalan')
    0.125
    >>> dist_length('ATCG', 'TAGC')
    0.0
    """
    return 1 - sim_length(src, tar)


def sim_prefix(src, tar):
    """Return the prefix similarity of two strings.

    Prefix similarity is the ratio of the length of the shorter term that
    exactly matches the longer term to the length of the shorter term,
    beginning at the start of both terms.

    :param str src: source string for comparison
    :param str tar: target string for comparison
    :returns: prefix similarity
    :rtype: float

    >>> sim_prefix('cat', 'hat')
    0.0
    >>> sim_prefix('Niall', 'Neil')
    0.25
    >>> sim_prefix('aluminum', 'Catalan')
    0.0
    >>> sim_prefix('ATCG', 'TAGC')
    0.0
    """
    if src == tar:
        return 1.0
    if not src or not tar:
        return 0.0
    min_word, max_word = (src, tar) if len(src) < len(tar) else (tar, src)
    min_len = len(min_word)
    for i in range(min_len, 0, -1):
        if min_word[:i] == max_word[:i]:
            return i/min_len
    return 0.0


def dist_prefix(src, tar):
    """Return the prefix distance between two strings.

    Prefix distance is the complement of prefix similarity:
    :math:`dist_{prefix} = 1 - sim_{prefix}`.

    :param str src: source string for comparison
    :param str tar: target string for comparison
    :returns: prefix distance
    :rtype: float

    >>> dist_prefix('cat', 'hat')
    1.0
    >>> dist_prefix('Niall', 'Neil')
    0.75
    >>> dist_prefix('aluminum', 'Catalan')
    1.0
    >>> dist_prefix('ATCG', 'TAGC')
    1.0
    """
    return 1 - sim_prefix(src, tar)


def sim_suffix(src, tar):
    """Return the suffix similarity of two strings.

    Suffix similarity is the ratio of the length of the shorter term that
    exactly matches the longer term to the length of the shorter term,
    beginning at the end of both terms.

    :param str src: source string for comparison
    :param str tar: target string for comparison
    :returns: suffix similarity
    :rtype: float

    >>> sim_suffix('cat', 'hat')
    0.6666666666666666
    >>> sim_suffix('Niall', 'Neil')
    0.25
    >>> sim_suffix('aluminum', 'Catalan')
    0.0
    >>> sim_suffix('ATCG', 'TAGC')
    0.0
    """
    if src == tar:
        return 1.0
    if not src or not tar:
        return 0.0
    min_word, max_word = (src, tar) if len(src) < len(tar) else (tar, src)
    min_len = len(min_word)
    for i in range(min_len, 0, -1):
        if min_word[-i:] == max_word[-i:]:
            return i/min_len
    return 0.0


def dist_suffix(src, tar):
    """Return the suffix distance between two strings.

    Suffix distance is the complement of suffix similarity:
    :math:`dist_{suffix} = 1 - sim_{suffix}`.

    :param str src: source string for comparison
    :param str tar: target string for comparison
    :returns: suffix distance
    :rtype: float

    >>> dist_suffix('cat', 'hat')
    0.33333333333333337
    >>> dist_suffix('Niall', 'Neil')
    0.75
    >>> dist_suffix('aluminum', 'Catalan')
    1.0
    >>> dist_suffix('ATCG', 'TAGC')
    1.0
    """
    return 1 - sim_suffix(src, tar)


def sim_mlipns(src, tar, threshold=0.25, max_mismatches=2):
    """Return the MLIPNS similarity of two strings.

    Modified Language-Independent Product Name Search (MLIPNS) is described in
    :cite:`Shannaq:2010`. This function returns only 1.0 (similar) or 0.0
    (not similar). LIPNS similarity is identical to normalized Hamming
    similarity.

    :param str src: source string for comparison
    :param str tar: target string for comparison
    :param float threshold: a number [0, 1] indicating the maximum similarity
        score, below which the strings are considered 'similar' (0.25 by
        default)
    :param int max_mismatches: a number indicating the allowable number of
        mismatches to remove before declaring two strings not similar (2 by
        default)
    :returns: MLIPNS similarity
    :rtype: float

    >>> sim_mlipns('cat', 'hat')
    1.0
    >>> sim_mlipns('Niall', 'Neil')
    0.0
    >>> sim_mlipns('aluminum', 'Catalan')
    0.0
    >>> sim_mlipns('ATCG', 'TAGC')
    0.0
    """
    if tar == src:
        return 1.0
    if not src or not tar:
        return 0.0

    mismatches = 0
    ham = hamming(src, tar, diff_lens=True)
    max_length = max(len(src), len(tar))
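    # Strip up to max_mismatches mismatched positions; the strings count as
    # similar as soon as the remaining mismatch ratio falls to the threshold.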
    while src and tar and mismatches <= max_mismatches:
        if max_length < 1 or (1-(max_length-ham)/max_length) <= threshold:
            return 1.0
        else:
            mismatches += 1
            ham -= 1
            max_length -= 1

    if max_length < 1:
        return 1.0
    return 0.0


def dist_mlipns(src, tar, threshold=0.25, max_mismatches=2):
    """Return the MLIPNS distance between two strings.

    MLIPNS distance is the complement of MLIPNS similarity:
    :math:`dist_{MLIPNS} = 1 - sim_{MLIPNS}`. This function returns only 0.0
    (not distant) or 1.0 (distant).

    :param str src: source string for comparison
    :param str tar: target string for comparison
    :param float threshold: a number [0, 1] indicating the maximum similarity
        score, below which the strings are considered 'similar' (0.25 by
        default)
    :param int max_mismatches: a number indicating the allowable number of
        mismatches to remove before declaring two strings not similar (2 by
        default)
    :returns: MLIPNS distance
    :rtype: float

    >>> dist_mlipns('cat', 'hat')
    0.0
    >>> dist_mlipns('Niall', 'Neil')
    1.0
    >>> dist_mlipns('aluminum', 'Catalan')
    1.0
    >>> dist_mlipns('ATCG', 'TAGC')
    1.0
    """
    return 1.0 - sim_mlipns(src, tar, threshold, max_mismatches)


def bag(src, tar):
    """Return the bag distance between two strings.

    Bag distance is proposed in :cite:`Bartolini:2002`. It is defined as:
    :math:`max(|multiset(src)-multiset(tar)|, |multiset(tar)-multiset(src)|)`.

    :param str src: source string for comparison
    :param str tar: target string for comparison
    :returns: bag distance
    :rtype: int

    >>> bag('cat', 'hat')
    1
    >>> bag('Niall', 'Neil')
    2
    >>> bag('aluminum', 'Catalan')
    5
    >>> bag('ATCG', 'TAGC')
    0
    >>> bag('abcdefg', 'hijklm')
    7
    >>> bag('abcdefg', 'hijklmno')
    8
    """
    if tar == src:
        return 0
    elif not src:
        return len(tar)
    elif not tar:
        return len(src)

    src_bag = Counter(src)
    tar_bag = Counter(tar)
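    # Counter subtraction keeps only positive counts, so each sum below is
    # the size of one of the two multiset differences.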
    return max(sum((src_bag-tar_bag).values()),
               sum((tar_bag-src_bag).values()))


def dist_bag(src, tar):
    """Return the normalized bag distance between two strings.

    Bag distance is normalized by dividing by :math:`max( |src|, |tar| )`.

    :param str src: source string for comparison
    :param str tar: target string for comparison
    :returns: normalized bag distance
    :rtype: float

    >>> dist_bag('cat', 'hat')
    0.3333333333333333
    >>> dist_bag('Niall', 'Neil')
    0.4
    >>> dist_bag('aluminum', 'Catalan')
    0.625
    >>> dist_bag('ATCG', 'TAGC')
    0.0
    """
    if tar == src:
        return 0.0
    if not src or not tar:
        return 1.0

    max_length = max(len(src), len(tar))

    return bag(src, tar)/max_length


def sim_bag(src, tar):
    """Return the normalized bag similarity of two strings.

    Normalized bag similarity is the complement of normalized bag distance:
    :math:`sim_{bag} = 1 - dist_{bag}`.

    :param str src: source string for comparison
    :param str tar: target string for comparison
    :returns: normalized bag similarity
    :rtype: float

    >>> round(sim_bag('cat', 'hat'), 12)
    0.666666666667
    >>> sim_bag('Niall', 'Neil')
    0.6
    >>> sim_bag('aluminum', 'Catalan')
    0.375
    >>> sim_bag('ATCG', 'TAGC')
    1.0
    """
    return 1-dist_bag(src, tar)


def editex(src, tar, cost=(0, 1, 2), local=False):
    """Return the Editex distance between two strings.

    As described on pages 3 & 4 of :cite:`Zobel:1996`.

    The local variant is based on :cite:`Ring:2009`.

    :param str src: source string for comparison
    :param str tar: target string for comparison
    :param tuple cost: a 3-tuple representing the cost of the three possible
        edits:
        match, same-group, and mismatch respectively (by default: (0, 1, 2))
    :param bool local: if True, the local variant of Editex is used
    :returns: Editex distance
    :rtype: int

    >>> editex('cat', 'hat')
    2
    >>> editex('Niall', 'Neil')
    2
    >>> editex('aluminum', 'Catalan')
    12
    >>> editex('ATCG', 'TAGC')
    6
    """
    match_cost, group_cost, mismatch_cost = cost
    letter_groups = ({'A', 'E', 'I', 'O', 'U', 'Y'},
                     {'B', 'P'},
                     {'C', 'K', 'Q'},
                     {'D', 'T'},
                     {'L', 'R'},
                     {'M', 'N'},
                     {'G', 'J'},
                     {'F', 'P', 'V'},
                     {'S', 'X', 'Z'},
                     {'C', 'S', 'Z'})
    all_letters = {'A', 'B', 'C', 'D', 'E', 'F', 'G', 'I', 'J', 'K', 'L', 'M',
                   'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'X', 'Y', 'Z'}
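
    # letter_groups are Zobel & Dart's phonetic equivalence classes; r_cost
    # charges group_cost when two distinct letters fall in the same class.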
2876
2877
    def r_cost(ch1, ch2):
2878
        """Return r(a,b) according to Zobel & Dart's definition."""
2879
        if ch1 == ch2:
2880
            return match_cost
2881
        if ch1 in all_letters and ch2 in all_letters:
2882
            for group in letter_groups:
2883
                if ch1 in group and ch2 in group:
2884
                    return group_cost
2885
        return mismatch_cost
2886
2887
    def d_cost(ch1, ch2):
2888
        """Return d(a,b) according to Zobel & Dart's definition."""
2889
        if ch1 != ch2 and (ch1 == 'H' or ch1 == 'W'):
2890
            return group_cost
2891
        return r_cost(ch1, ch2)
2892
2893
    # convert both src & tar to NFKD normalized unicode
2894
    src = unicode_normalize('NFKD', text_type(src.upper()))
2895
    tar = unicode_normalize('NFKD', text_type(tar.upper()))
2896
    # convert ß to SS (for Python2)
2897
    src = src.replace('ß', 'SS')
2898
    tar = tar.replace('ß', 'SS')
2899
2900
    if src == tar:
2901
        return 0
2902
    if not src:
2903
        return len(tar) * mismatch_cost
2904
    if not tar:
2905
        return len(src) * mismatch_cost
2906
2907
    d_mat = np_zeros((len(src)+1, len(tar)+1), dtype=np_int)
2908
    lens = len(src)
2909
    lent = len(tar)
2910
    src = ' '+src
2911
    tar = ' '+tar
2912
2913
    if not local:
2914
        for i in range(1, lens+1):
2915
            d_mat[i, 0] = d_mat[i-1, 0] + d_cost(src[i-1], src[i])
2916
    for j in range(1, lent+1):
2917
        d_mat[0, j] = d_mat[0, j-1] + d_cost(tar[j-1], tar[j])
2918
2919
    for i in range(1, lens+1):
2920
        for j in range(1, lent+1):
2921
            d_mat[i, j] = min(d_mat[i-1, j] + d_cost(src[i-1], src[i]),
2922
                              d_mat[i, j-1] + d_cost(tar[j-1], tar[j]),
2923
                              d_mat[i-1, j-1] + r_cost(src[i], tar[j]))
2924
2925
    return d_mat[lens, lent]
2926
2927
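# Hedged usage sketch (editorial addition, not part of the original module):
# the cost tuple reweights the three Editex operations. Raising the
# same-group cost to equal the mismatch cost makes phonetically similar
# substitutions (e.g. 'd' for 't') as expensive as arbitrary ones:
#
#     editex('cat', 'cad', cost=(0, 2, 2))  # cf. editex('cat', 'cad') == 1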
2928
def dist_editex(src, tar, cost=(0, 1, 2), local=False):
2929
    """Return the normalized Editex distance between two strings.
2930
2931
    The Editex distance is normalized by dividing the Editex distance
2932
    (calculated with either the standard or the local variant) by the
2933
    greater of the number of characters in src and the number of characters
2934
    in tar, multiplied by the mismatch cost. When the mismatch cost is 1,
2935
    this is simply the greater of the lengths of the two strings
2936
    src & tar.
2937
2938
    :param str src: source string for comparison
2939
    :param str tar: target string for comparison
2940
    :param tuple cost: a 3-tuple representing the cost of the three possible
2941
        edit operations:
2942
        match, same-group, and mismatch, respectively (by default: (0, 1, 2))
2943
    :param bool local: if True, the local variant of Editex is used
2944
    :returns: normalized Editex distance
2945
    :rtype: float
2946
2947
    >>> round(dist_editex('cat', 'hat'), 12)
2948
    0.333333333333
2949
    >>> round(dist_editex('Niall', 'Neil'), 12)
2950
    0.2
2951
    >>> dist_editex('aluminum', 'Catalan')
2952
    0.75
2953
    >>> dist_editex('ATCG', 'TAGC')
2954
    0.75
2955
    """
2956
    if src == tar:
2957
        return 0
2958
    mismatch_cost = cost[2]
2959
    return (editex(src, tar, cost, local) /
2960
            (max(len(src)*mismatch_cost, len(tar)*mismatch_cost)))
2961
2962
2963
def sim_editex(src, tar, cost=(0, 1, 2), local=False):
2964
    """Return the normalized Editex similarity of two strings.
2965
2966
    The Editex similarity is the complement of Editex distance:
2967
    :math:`sim_{Editex} = 1 - dist_{Editex}`.
2968
2969
    :param str src: source string for comparison
2970
    :param str tar: target string for comparison
2971
    :param tuple cost: a 3-tuple representing the cost of the three possible
2972
        edit operations:
2973
        match, same-group, and mismatch, respectively (by default: (0, 1, 2))
2974
    :param bool local: if True, the local variant of Editex is used
2975
    :returns: normalized Editex similarity
2976
    :rtype: float
2977
2978
    >>> round(sim_editex('cat', 'hat'), 12)
2979
    0.666666666667
2980
    >>> round(sim_editex('Niall', 'Neil'), 12)
2981
    0.8
2982
    >>> sim_editex('aluminum', 'Catalan')
2983
    0.25
2984
    >>> sim_editex('ATCG', 'TAGC')
2985
    0.25
2986
    """
2987
    return 1 - dist_editex(src, tar, cost, local)
2988
2989
2990
def eudex_hamming(src, tar, weights='exponential', max_length=8,
2991
                  normalized=False):
2992
    """Calculate the Hamming distance between the Eudex hashes of two terms.
2993
2994
    Cf. :cite:`Ticki:2016`.
2995
2996
    - If weights is set to None, a simple Hamming distance is calculated.
2997
    - If weights is set to 'exponential', weight decays by powers of 2, as
2998
      proposed in the eudex specification: https://github.com/ticki/eudex.
2999
    - If weights is set to 'fibonacci', weight decays through the Fibonacci
3000
      series, as in the eudex reference implementation.
3001
    - If weights is set to a callable, it is assumed to return a generator,
3002
      which is then used to populate the series of weights.
3003
    - If weights is set to an iterable, the iterable's values should be
3004
      integers and will be used as the weights.
3005
3006
    :param str src: source string for comparison
3007
    :param str tar: target string for comparison
3008
    :param str, iterable, or generator function weights: the weights or weights
3009
        generator function
3010
    :param int max_length: the number of characters to encode as a eudex hash
3011
    :param bool normalized: normalizes to [0, 1] if True
3012
    :returns: the Eudex Hamming distance
3013
    :rtype: int
3014
3015
    >>> eudex_hamming('cat', 'hat')
3016
    128
3017
    >>> eudex_hamming('Niall', 'Neil')
3018
    2
3019
    >>> eudex_hamming('Colin', 'Cuilen')
3020
    10
3021
    >>> eudex_hamming('ATCG', 'TAGC')
3022
    403
3023
3024
    >>> eudex_hamming('cat', 'hat', weights='fibonacci')
3025
    34
3026
    >>> eudex_hamming('Niall', 'Neil', weights='fibonacci')
3027
    2
3028
    >>> eudex_hamming('Colin', 'Cuilen', weights='fibonacci')
3029
    7
3030
    >>> eudex_hamming('ATCG', 'TAGC', weights='fibonacci')
3031
    117
3032
3033
    >>> eudex_hamming('cat', 'hat', weights=None)
3034
    1
3035
    >>> eudex_hamming('Niall', 'Neil', weights=None)
3036
    1
3037
    >>> eudex_hamming('Colin', 'Cuilen', weights=None)
3038
    2
3039
    >>> eudex_hamming('ATCG', 'TAGC', weights=None)
3040
    9
3041
3042
    >>> # Using the OEIS A000142:
3043
    >>> eudex_hamming('cat', 'hat', [1, 1, 2, 6, 24, 120, 720, 5040])
3044
    1
3045
    >>> eudex_hamming('Niall', 'Neil', [1, 1, 2, 6, 24, 120, 720, 5040])
3046
    720
3047
    >>> eudex_hamming('Colin', 'Cuilen', [1, 1, 2, 6, 24, 120, 720, 5040])
3048
    744
3049
    >>> eudex_hamming('ATCG', 'TAGC', [1, 1, 2, 6, 24, 120, 720, 5040])
3050
    6243
3051
    """
3052
    def _gen_fibonacci():
3053
        """Yield the next Fibonacci number.
3054
3055
        Based on https://www.python-course.eu/generators.php
3056
        Starts at Fibonacci number 3 (the second 1)
3057
3058
        :returns: the next Fibonacci number
3059
        :rtype: int
3060
        """
3061
        num_a, num_b = 1, 2
3062
        while True:
3063
            yield num_a
3064
            num_a, num_b = num_b, num_a + num_b
3065
3066
    def _gen_exponential(base=2):
3067
        """Yield the next value in an exponential series of the base.
3068
3069
        Starts at base**0
3070
3071
        :param int base: the base to exponentiate
3072
        :returns: the next power of `base`
3073
        :rtype: int
3074
        """
3075
        exp = 0
3076
        while True:
3077
            yield base ** exp
3078
            exp += 1
3079
3080
    # Calculate the eudex hashes and XOR them
3081
    xored = (eudex(src, max_length=max_length) ^
3082
             eudex(tar, max_length=max_length))
3083
3084
    # Simple hamming distance (all bits are equal)
3085
    if not weights:
3086
        binary = bin(xored)
3087
        distance = binary.count('1')
3088
        if normalized:
3089
            return distance/(len(binary)-2)
3090
        return distance
3091
3092
    # If weights is a function, it should create a generator,
3093
    # which we now use to populate a list
3094
    if callable(weights):
3095
        weights = weights()
3096
    elif weights == 'exponential':
3097
        weights = _gen_exponential()
3098
    elif weights == 'fibonacci':
3099
        weights = _gen_fibonacci()
3100
    if isinstance(weights, GeneratorType):
3101
        weights = [next(weights) for _ in range(max_length)][::-1]
3102
3103
    # Sum the weighted hamming distance
3104
    distance = 0
3105
    max_distance = 0
3106
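    # Consume the XORed hash one byte at a time from the least-significant
    # end; weights.pop() yields the weights in increasing order (the list was
    # reversed above), so the high-order bytes (the earliest characters)
    # carry the largest weights.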
    while (xored or normalized) and weights:
3107
        max_distance += 8*weights[-1]
3108
        distance += bin(xored & 0xFF).count('1') * weights.pop()
3109
        xored >>= 8
3110
3111
    if normalized:
3112
        distance /= max_distance
3113
3114
    return distance
3115
3116
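# Hedged sketch (editorial addition, not part of the original module): per
# the docstring above, `weights` may be a callable that returns a generator.
# A constant-weight generator reduces the weighted form to the simple
# Hamming distance:
#
#     def _all_ones():
#         while True:
#             yield 1
#
#     eudex_hamming('cat', 'hat', weights=_all_ones)
#     # expected to equal eudex_hamming('cat', 'hat', weights=None) == 1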
3117
def dist_eudex(src, tar, weights='exponential', max_length=8):
3118
    """Return normalized Hamming distance between Eudex hashes of two terms.
3119
3120
    This is the Eudex Hamming distance, normalized to [0, 1].
3121
3122
    :param str src: source string for comparison
3123
    :param str tar: target string for comparison
3124
    :param str, iterable, or generator function weights: the weights or weights
3125
        generator function
3126
    :param max_length: the number of characters to encode as a eudex hash
3127
    :returns: the normalized Eudex distance
3128
    :rtype: float
3129
3130
    >>> round(dist_eudex('cat', 'hat'), 12)
3131
    0.062745098039
3132
    >>> round(dist_eudex('Niall', 'Neil'), 12)
3133
    0.000980392157
3134
    >>> round(dist_eudex('Colin', 'Cuilen'), 12)
3135
    0.004901960784
3136
    >>> round(dist_eudex('ATCG', 'TAGC'), 12)
3137
    0.197549019608
3138
    """
3139
    return eudex_hamming(src, tar, weights, max_length, True)
3140
3141
3142
def sim_eudex(src, tar, weights='exponential', max_length=8):
3143
    """Return normalized Hamming similarity between Eudex hashes of two terms.
3144
3145
    Normalized Eudex similarity is the complement of normalized Eudex distance:
3146
    :math:`sim_{Eudex} = 1 - dist_{Eudex}`.
3147
3148
    :param str src: source string for comparison
3149
    :param str tar: target string for comparison
3150
    :param str, iterable, or generator function weights: the weights or weights
3151
        generator function
3152
    :param int max_length: the number of characters to encode as a eudex hash
3153
    :returns: the normalized Eudex similarity
3154
    :rtype: float
3155
3156
    >>> round(sim_eudex('cat', 'hat'), 12)
3157
    0.937254901961
3158
    >>> round(sim_eudex('Niall', 'Neil'), 12)
3159
    0.999019607843
3160
    >>> round(sim_eudex('Colin', 'Cuilen'), 12)
3161
    0.995098039216
3162
    >>> round(sim_eudex('ATCG', 'TAGC'), 12)
3163
    0.802450980392
3164
    """
3165
    return 1-dist_eudex(src, tar, weights, max_length)
3166
3167
3168
def sift4_simplest(src, tar, max_offset=5):
3169
    """Return the "simplest" Sift4 distance between two terms.
3170
3171
    This is an approximation of edit distance, described in
3172
    :cite:`Zackwehdex:2014`.
3173
3174
    :param str src: source string for comparison
3175
    :param str tar: target string for comparison
3176
    :param int max_offset: the number of characters to search for matching
        letters
3177
    :returns: the Sift4 distance according to the simplest formula
3178
    :rtype: int
3179
3180
    >>> sift4_simplest('cat', 'hat')
3181
    1
3182
    >>> sift4_simplest('Niall', 'Neil')
3183
    2
3184
    >>> sift4_simplest('Colin', 'Cuilen')
3185
    3
3186
    >>> sift4_simplest('ATCG', 'TAGC')
3187
    2
3188
    """
3189
    if not src:
3190
        return len(tar)
3191
3192
    if not tar:
3193
        return len(src)
3194
3195
    src_len = len(src)
3196
    tar_len = len(tar)
3197
3198
    src_cur = 0
3199
    tar_cur = 0
3200
    lcss = 0
3201
    local_cs = 0
3202
3203
    while (src_cur < src_len) and (tar_cur < tar_len):
3204
        if src[src_cur] == tar[tar_cur]:
3205
            local_cs += 1
3206
        else:
3207
            lcss += local_cs
3208
            local_cs = 0
3209
            if src_cur != tar_cur:
3210
                src_cur = tar_cur = max(src_cur, tar_cur)
3211
            for i in range(max_offset):
3212
                if not ((src_cur+i < src_len) or (tar_cur+i < tar_len)):
3213
                    break
3214
                if (src_cur+i < src_len) and (src[src_cur+i] == tar[tar_cur]):
3215
                    src_cur += i
3216
                    local_cs += 1
3217
                    break
3218
                if (tar_cur+i < tar_len) and (src[src_cur] == tar[tar_cur+i]):
3219
                    tar_cur += i
3220
                    local_cs += 1
3221
                    break
3222
3223
        src_cur += 1
3224
        tar_cur += 1
3225
3226
    lcss += local_cs
3227
    return round(max(src_len, tar_len) - lcss)
3228
3229
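# Hedged note (editorial addition): max_offset bounds how far ahead
# sift4_simplest scans to re-synchronize after a mismatch; with max_offset=1
# there is effectively no lookahead, so displaced matches go unrecovered:
#
#     sift4_simplest('ATCG', 'TAGC', max_offset=1)  # larger than the default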
3230
def sift4_common(src, tar, max_offset=5, max_distance=0):
3231
    """Return the "common" Sift4 distance between two terms.
3232
3233
    This is an approximation of edit distance, described in
3234
    :cite:`Zackwehdex:2014`.
3235
3236
    :param str src: source string for comparison
3237
    :param str tar: target string for comparison
3238
    :param int max_offset: the number of characters to search for matching
        letters
3239
    :param int max_distance: the distance at which to stop and exit
3240
    :returns: the Sift4 distance according to the common formula
3241
    :rtype: int
3242
3243
    >>> sift4_common('cat', 'hat')
3244
    1
3245
    >>> sift4_common('Niall', 'Neil')
3246
    2
3247
    >>> sift4_common('Colin', 'Cuilen')
3248
    3
3249
    >>> sift4_common('ATCG', 'TAGC')
3250
    2
3251
    """
3252
    if not src:
3253
        return len(tar)
3254
3255
    if not tar:
3256
        return len(src)
3257
3258
    src_len = len(src)
3259
    tar_len = len(tar)
3260
3261
    src_cur = 0
3262
    tar_cur = 0
3263
    lcss = 0
3264
    local_cs = 0
3265
    trans = 0
3266
    offset_arr = []
3267
3268
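    # offset_arr records the positions of earlier character matches so that
    # subsequent matches can be classified as transpositions.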
    while (src_cur < src_len) and (tar_cur < tar_len):
3269
        if src[src_cur] == tar[tar_cur]:
3270
            local_cs += 1
3271
            is_trans = False
3272
            i = 0
3273
            while i < len(offset_arr):
3274
                ofs = offset_arr[i]
3275
                if src_cur <= ofs['src_cur'] or tar_cur <= ofs['tar_cur']:
3276
                    is_trans = (abs(tar_cur-src_cur) >=
3277
                                abs(ofs['tar_cur']-ofs['src_cur']))
3278
                    if is_trans:
3279
                        trans += 1
3280
                    elif not ofs['trans']:
3281
                        ofs['trans'] = True
3282
                        trans += 1
3283
                    break
3284
                elif src_cur > ofs['tar_cur'] and tar_cur > ofs['src_cur']:
3285
                    del offset_arr[i]
3286
                else:
3287
                    i += 1
3288
3289
            offset_arr.append({'src_cur': src_cur, 'tar_cur': tar_cur,
3290
                               'trans': is_trans})
3291
        else:
3292
            lcss += local_cs
3293
            local_cs = 0
3294
            if src_cur != tar_cur:
3295
                src_cur = tar_cur = min(src_cur, tar_cur)
3296
            for i in range(max_offset):
3297
                if not ((src_cur+i < src_len) or (tar_cur+i < tar_len)):
3298
                    break
3299
                if (src_cur+i < src_len) and (src[src_cur+i] == tar[tar_cur]):
3300
                    src_cur += i-1
3301
                    tar_cur -= 1
3302
                    break
3303
                if (tar_cur+i < tar_len) and (src[src_cur] == tar[tar_cur+i]):
3304
                    src_cur -= 1
3305
                    tar_cur += i-1
3306
                    break
3307
3308
        src_cur += 1
3309
        tar_cur += 1
3310
3311
        if max_distance:
3312
            temporary_distance = max(src_cur, tar_cur) - lcss + trans
3313
            if temporary_distance >= max_distance:
3314
                return round(temporary_distance)
3315
3316
        if (src_cur >= src_len) or (tar_cur >= tar_len):
3317
            lcss += local_cs
3318
            local_cs = 0
3319
            src_cur = tar_cur = min(src_cur, tar_cur)
3320
3321
    lcss += local_cs
3322
    return round(max(src_len, tar_len) - lcss + trans)
3323
3324
3325
def dist_sift4(src, tar, max_offset=5, max_distance=0):
3326
    """Return the normalized "common" Sift4 distance between two terms.
3327
3328
    This is Sift4 distance, normalized to [0, 1].
3329
3330
    :param str src: source string for comparison
3331
    :param str tar: target string for comparison
3332
    :param int max_offset: the number of characters to search for matching
        letters
3333
    :param int max_distance: the distance at which to stop and exit
3334
    :returns: the normalized Sift4 distance
3335
    :rtype: float
3336
3337
    >>> round(dist_sift4('cat', 'hat'), 12)
3338
    0.333333333333
3339
    >>> dist_sift4('Niall', 'Neil')
3340
    0.4
3341
    >>> dist_sift4('Colin', 'Cuilen')
3342
    0.5
3343
    >>> dist_sift4('ATCG', 'TAGC')
3344
    0.5
3345
    """
3346
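    # The 1 in max() guards against division by zero when both strings are
    # empty.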
    return (sift4_common(src, tar, max_offset, max_distance) /
3347
            (max(len(src), len(tar), 1)))
3348
3349
3350
def sim_sift4(src, tar, max_offset=5, max_distance=0):
3351
    """Return the normalized "common" Sift4 similarity of two terms.
3352
3353
    Normalized Sift4 similarity is the complement of normalized Sift4 distance:
3354
    :math:`sim_{Sift4} = 1 - dist_{Sift4}`.
3355
3356
    :param str src: source string for comparison
3357
    :param str tar: target string for comparison
3358
    :param int max_offset: the number of characters to search for matching
        letters
3359
    :param int max_distance: the distance at which to stop and exit
3360
    :returns: the normalized Sift4 similarity
3361
    :rtype: float
3362
3363
    >>> round(sim_sift4('cat', 'hat'), 12)
3364
    0.666666666667
3365
    >>> sim_sift4('Niall', 'Neil')
3366
    0.6
3367
    >>> sim_sift4('Colin', 'Cuilen')
3368
    0.5
3369
    >>> sim_sift4('ATCG', 'TAGC')
3370
    0.5
3371
    """
3372
    return 1-dist_sift4(src, tar, max_offset, max_distance)
3373
3374
3375
def sim_baystat(src, tar, min_ss_len=None, left_ext=None, right_ext=None):
3376
    """Return the Baystat similarity.
3377
3378
    Good results for shorter words are reported when setting min_ss_len to 1
3379
    and either left_ext OR right_ext to 1.
3380
3381
    The Baystat similarity is defined in :cite:`Furnohr:2002`.
3382
3383
    This is ostensibly a port of the R module PPRL's implementation:
3384
    https://github.com/cran/PPRL/blob/master/src/MTB_Baystat.cpp
3385
    :cite:`Rukasz:2018`. As such, this could be made more pythonic.
3386
3387
    :param str src: source string for comparison
3388
    :param str tar: target string for comparison
3389
    :param int min_ss_len: minimum substring length to be considered
3390
    :param int left_ext: left-side extension length
3391
    :param int right_ext: right-side extension length
3392
    :returns: the Baystat similarity
3393
    :rtype: float
3394
3395
    >>> round(sim_baystat('cat', 'hat'), 12)
3396
    0.666666666667
3397
    >>> sim_baystat('Niall', 'Neil')
3398
    0.4
3399
    >>> round(sim_baystat('Colin', 'Cuilen'), 12)
3400
    0.166666666667
3401
    >>> sim_baystat('ATCG', 'TAGC')
3402
    0.0
3403
    """
3404
    if src == tar:
3405
        return 1
3406
    if not src or not tar:
3407
        return 0
3408
3409
    max_len = max(len(src), len(tar))
3410
3411
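    # NB: this truthiness test means that explicitly passing 0 for any of
    # these arguments also triggers the automatic defaults below.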
    if not (min_ss_len and left_ext and right_ext):
3412
        # These can be set via arguments to the function. Otherwise they are
3413
        # set automatically based on values from the article.
3414
        if max_len >= 7:
3415
            min_ss_len = 2
3416
            left_ext = 2
3417
            right_ext = 2
3418
        else:
3419
            # The paper suggests that, for short names, setting exactly one
3420
            # of left_ext and right_ext to 1 gives good results.
3421
            # I use 0 & 0 as the default in this case.
3422
            min_ss_len = 1
3423
            left_ext = 0
3424
            right_ext = 0
3425
3426
    pos = 0
3427
    match_len = 0
3428
3429
    while True:
3430
        if pos + min_ss_len > len(src):
3431
            return match_len/max_len
3432
3433
        hit_len = 0
3434
        ix = 1
3435
3436
        substring = src[pos:pos + min_ss_len]
3437
        search_begin = pos - left_ext
3438
3439
        if search_begin < 0:
3440
            search_begin = 0
3441
            left_ext_len = pos
3442
        else:
3443
            left_ext_len = left_ext
3444
3445
        if pos + min_ss_len + right_ext >= len(tar):
3446
            right_ext_len = len(tar) - pos - min_ss_len
3447
        else:
3448
            right_ext_len = right_ext
3449
3450
        if (search_begin + left_ext_len + min_ss_len + right_ext_len >
3451
                search_begin):
3452
            search_val = tar[search_begin:(search_begin + left_ext_len +
3453
                                           min_ss_len + right_ext_len)]
3454
        else:
3455
            search_val = ''
3456
3457
        flagged_tar = ''
3458
        while substring in search_val and pos + ix <= len(src):
3459
            hit_len = len(substring)
3460
            flagged_tar = tar.replace(substring, '#'*hit_len)
3461
3462
            if pos + min_ss_len + ix <= len(src):
3463
                substring = src[pos:pos + min_ss_len + ix]
3464
3465
            if pos+min_ss_len + right_ext_len + 1 <= len(tar):
3466
                right_ext_len += 1
3467
3468
            # The following is unnecessary, I think
3469
            # if (search_begin + left_ext_len + min_ss_len + right_ext_len <=
3470
            #         len(tar)):
3471
            search_val = tar[search_begin:(search_begin + left_ext_len +
3472
                                           min_ss_len + right_ext_len)]
3473
3474
            ix += 1
3475
3476
        if hit_len > 0:
3477
            tar = flagged_tar
3478
3479
        match_len += hit_len
3480
        pos += ix
3481
3482
3483
def dist_baystat(src, tar, min_ss_len=None, left_ext=None, right_ext=None):
3484
    """Return the Baystat distance.
3485
3486
    Normalized Baystat similarity is the complement of normalized Baystat
3487
    distance: :math:`sim_{Baystat} = 1 - dist_{Baystat}`.
3488
3489
    :param str src: source string for comparison
3490
    :param str tar: target string for comparison
3491
    :param int min_ss_len: minimum substring length to be considered
3492
    :param int left_ext: left-side extension length
3493
    :param int right_ext: right-side extension length
3494
    :returns: the Baystat distance
3495
    :rtype: float
3496
3497
    >>> round(dist_baystat('cat', 'hat'), 12)
3498
    0.333333333333
3499
    >>> dist_baystat('Niall', 'Neil')
3500
    0.6
3501
    >>> round(dist_baystat('Colin', 'Cuilen'), 12)
3502
    0.833333333333
3503
    >>> dist_baystat('ATCG', 'TAGC')
3504
    1.0
3505
    """
3506
    return 1-sim_baystat(src, tar, min_ss_len, left_ext, right_ext)
3507
3508
3509
def typo(src, tar, metric='euclidean', cost=(1, 1, 0.5, 0.5), layout='QWERTY'):
3510
    """Return the typo distance between two strings.
3511
3512
    This is inspired by Typo-Distance :cite:`Song:2011`, and a fair bit of
3513
    this was copied from that module. Compared to the original, this supports
3514
    different metrics for substitution.
3515
3516
    :param str src: source string for comparison
3517
    :param str tar: target string for comparison
3518
    :param str metric: supported values include: 'euclidean', 'manhattan',
3519
          'log-euclidean', and 'log-manhattan'
3520
    :param tuple cost: a 4-tuple representing the cost of the four possible
3521
        edits: inserts, deletes, substitutions, and shift, respectively (by
3522
        default: (1, 1, 0.5, 0.5)) The substitution & shift costs should be
3523
        significantly less than the cost of an insertion & deletion unless
3524
        a log metric is used.
3525
    :param str layout: name of the keyboard layout to use (Currently supported:
3526
        QWERTY, Dvorak, AZERTY, QWERTZ)
3527
    :returns: typo distance
3528
    :rtype: float
3529
3530
    >>> typo('cat', 'hat')
3531
    1.5811388
3532
    >>> typo('Niall', 'Neil')
3533
    2.8251407
3534
    >>> typo('Colin', 'Cuilen')
3535
    3.4142137
3536
    >>> typo('ATCG', 'TAGC')
3537
    2.5
3538
3539
    >>> typo('cat', 'hat', metric='manhattan')
3540
    2.0
3541
    >>> typo('Niall', 'Neil', metric='manhattan')
3542
    3.0
3543
    >>> typo('Colin', 'Cuilen', metric='manhattan')
3544
    3.5
3545
    >>> typo('ATCG', 'TAGC', metric='manhattan')
3546
    2.5
3547
3548
    >>> typo('cat', 'hat', metric='log-manhattan')
3549
    0.804719
3550
    >>> typo('Niall', 'Neil', metric='log-manhattan')
3551
    2.2424533
3552
    >>> typo('Colin', 'Cuilen', metric='log-manhattan')
3553
    2.2424533
3554
    >>> typo('ATCG', 'TAGC', metric='log-manhattan')
3555
    2.3465736
3556
    """
3557
    ins_cost, del_cost, sub_cost, shift_cost = cost
3558
3559
    if src == tar:
3560
        return 0.0
3561
    if not src:
3562
        return len(tar) * ins_cost
3563
    if not tar:
3564
        return len(src) * del_cost
3565
3566
    kbs = {'QWERTY': (
3567
        (('`', '1', '2', '3', '4', '5', '6', '7', '8', '9', '0', '-', '='),
3568
         ('', 'q', 'w', 'e', 'r', 't', 'y', 'u', 'i', 'o', 'p', '[', ']',
3569
          '\\'),
3570
         ('', 'a', 's', 'd', 'f', 'g', 'h', 'j', 'k', 'l', ';', '\''),
3571
         ('', 'z', 'x', 'c', 'v', 'b', 'n', 'm', ',', '.', '/')),
3572
        (('~', '!', '@', '#', '$', '%', '^', '&', '*', '(', ')', '_', '+'),
3573
         ('', 'Q', 'W', 'E', 'R', 'T', 'Y', 'U', 'I', 'O', 'P', '{', '}', '|'),
3574
         ('', 'A', 'S', 'D', 'F', 'G', 'H', 'J', 'K', 'L', ':', '"'),
3575
         ('', 'Z', 'X', 'C', 'V', 'B', 'N', 'M', '<', '>', '?'))
3576
    ), 'Dvorak': (
3577
        (('`', '1', '2', '3', '4', '5', '6', '7', '8', '9', '0', '[', ']'),
3578
         ('', '\'', ',', '.', 'p', 'y', 'f', 'g', 'c', 'r', 'l', '/', '=',
3579
          '\\'),
3580
         ('', 'a', 'o', 'e', 'u', 'i', 'd', 'h', 't', 'n', 's', '-'),
3581
         ('', ';', 'q', 'j', 'k', 'x', 'b', 'm', 'w', 'v', 'z')),
3582
        (('~', '!', '@', '#', '$', '%', '^', '&', '*', '(', ')', '{', '}'),
3583
         ('', '"', '<', '>', 'P', 'Y', 'F', 'G', 'C', 'R', 'L', '?', '+', '|'),
3584
         ('', 'A', 'O', 'E', 'U', 'I', 'D', 'H', 'T', 'N', 'S', '_'),
3585
         ('', ':', 'Q', 'J', 'K', 'X', 'B', 'M', 'W', 'V', 'Z'))
3586
    ), 'AZERTY': (
3587
        (('²', '&', 'é', '"', '\'', '(', '-', 'è', '_', 'ç', 'à', ')', '='),
3588
         ('', 'a', 'z', 'e', 'r', 't', 'y', 'u', 'i', 'o', 'p', '', '$'),
3589
         ('', 'q', 's', 'd', 'f', 'g', 'h', 'j', 'k', 'l', 'm', 'ù', '*'),
3590
         ('<', 'w', 'x', 'c', 'v', 'b', 'n', ',', ';', ':', '!')),
3591
        (('~', '1', '2', '3', '4', '5', '6', '7', '8', '9', '0', '°', '+'),
3592
         ('', 'A', 'Z', 'E', 'R', 'T', 'Y', 'U', 'I', 'O', 'P', '', '£'),
3593
         ('', 'Q', 'S', 'D', 'F', 'G', 'H', 'J', 'K', 'L', 'M', 'Ù', 'μ'),
3594
         ('>', 'W', 'X', 'C', 'V', 'B', 'N', '?', '.', '/', '§'))
3595
    ), 'QWERTZ': (
3596
        (('', '1', '2', '3', '4', '5', '6', '7', '8', '9', '0', 'ß', ''),
3597
         ('', 'q', 'w', 'e', 'r', 't', 'z', 'u', 'i', 'o', 'p', 'ü', '+',
3598
          '\\'),
3599
         ('', 'a', 's', 'd', 'f', 'g', 'h', 'j', 'k', 'l', 'ö', 'ä', '#'),
3600
         ('<', 'y', 'x', 'c', 'v', 'b', 'n', 'm', ',', '.', '-')),
3601
        (('°', '!', '"', '§', '$', '%', '&', '/', '(', ')', '=', '?', ''),
3602
         ('', 'Q', 'W', 'E', 'R', 'T', 'Z', 'U', 'I', 'O', 'P', 'Ü', '*', ''),
3603
         ('', 'A', 'S', 'D', 'F', 'G', 'H', 'J', 'K', 'L', 'Ö', 'Ä', '\''),
3604
         ('>', 'Y', 'X', 'C', 'V', 'B', 'N', 'M', ';', ':', '_'))
3605
    )}
3606
3607
    keyboard = kbs[layout]
3608
    lowercase = {item for sublist in keyboard[0] for item in sublist}
3609
    uppercase = {item for sublist in keyboard[1] for item in sublist}
3610
3611
    def _kb_array_for_char(char):
3612
        """Return the keyboard layout that contains ch."""
3613
        if char in lowercase:
3614
            return keyboard[0]
3615
        elif char in uppercase:
3616
            return keyboard[1]
3617
        raise ValueError(char + ' not found in any keyboard layouts')
3618
3619
    def _get_char_coord(char, kb_array):
3620
        """Return the row & column of char in the keyboard."""
3621
        for row in kb_array:  # pragma: no branch
3622
            if char in row:
3623
                return kb_array.index(row), row.index(char)
3624
3625
    def _euclidean_keyboard_distance(char1, char2):
3626
        row1, col1 = _get_char_coord(char1, _kb_array_for_char(char1))
3627
        row2, col2 = _get_char_coord(char2, _kb_array_for_char(char2))
3628
        return ((row1 - row2) ** 2 + (col1 - col2) ** 2) ** 0.5
3629
3630
    def _manhattan_keyboard_distance(char1, char2):
3631
        row1, col1 = _get_char_coord(char1, _kb_array_for_char(char1))
3632
        row2, col2 = _get_char_coord(char2, _kb_array_for_char(char2))
3633
        return abs(row1 - row2) + abs(col1 - col2)
3634
3635
    def _log_euclidean_keyboard_distance(char1, char2):
3636
        return log(1 + _euclidean_keyboard_distance(char1, char2))
3637
3638
    def _log_manhattan_keyboard_distance(char1, char2):
3639
        return log(1 + _manhattan_keyboard_distance(char1, char2))
3640
3641
    metric_dict = {'euclidean': _euclidean_keyboard_distance,
3642
                   'manhattan': _manhattan_keyboard_distance,
3643
                   'log-euclidean': _log_euclidean_keyboard_distance,
3644
                   'log-manhattan': _log_manhattan_keyboard_distance}
3645
3646
    def _substitution_cost(char1, char2):
3647
        cost = sub_cost
3648
        cost *= (metric_dict[metric](char1, char2) +
3649
                 shift_cost * (_kb_array_for_char(char1) !=
3650
                               _kb_array_for_char(char2)))
3651
        return cost
3652
3653
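    # Standard Levenshtein-style dynamic program; only the substitution cost
    # is keyboard-aware (insertions & deletions use the flat costs).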
    d_mat = np_zeros((len(src) + 1, len(tar) + 1), dtype=np_float32)
3654
    for i in range(len(src) + 1):
3655
        d_mat[i, 0] = i * del_cost
3656
    for j in range(len(tar) + 1):
3657
        d_mat[0, j] = j * ins_cost
3658
3659
    for i in range(len(src)):
3660
        for j in range(len(tar)):
3661
            d_mat[i + 1, j + 1] = min(
3662
                d_mat[i + 1, j] + ins_cost,  # ins
3663
                d_mat[i, j + 1] + del_cost,  # del
3664
                d_mat[i, j] + (_substitution_cost(src[i], tar[j])
3665
                               if src[i] != tar[j] else 0)  # sub/==
3666
            )
3667
3668
    return d_mat[len(src), len(tar)]
3669
3670
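# Hedged usage sketch (editorial addition, not part of the original module):
# the layout argument selects one of the keyboard tables above. 'c' & 'h'
# sit closer together on Dvorak than on QWERTY, so a smaller distance is
# expected here:
#
#     typo('cat', 'hat', layout='Dvorak')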
3671
def dist_typo(src, tar, metric='euclidean', cost=(1, 1, 0.5, 0.5)):
3672
    """Return the normalized typo distance between two strings.
3673
3674
    This is typo distance, normalized to [0, 1].
3675
3676
    :param str src: source string for comparison
3677
    :param str tar: target string for comparison
3678
    :param str metric: supported values include: 'euclidean', 'manhattan',
3679
          'log-euclidean', and 'log-manhattan'
3680
    :param tuple cost: a 4-tuple representing the cost of the four possible
3681
        edits: inserts, deletes, substitutions, and shift, respectively (by
3682
        default: (1, 1, 0.5, 0.5)) The substitution & shift costs should be
3683
        significantly less than the cost of an insertion & deletion unless
3684
        a log metric is used.
3685
    :returns: normalized typo distance
3686
    :rtype: float
3687
3688
    >>> round(dist_typo('cat', 'hat'), 12)
3689
    0.527046283086
3690
    >>> round(dist_typo('Niall', 'Neil'), 12)
3691
    0.565028142929
3692
    >>> round(dist_typo('Colin', 'Cuilen'), 12)
3693
    0.569035609563
3694
    >>> dist_typo('ATCG', 'TAGC')
3695
    0.625
3696
    """
3697
    if src == tar:
3698
        return 0
3699
    ins_cost, del_cost = cost[:2]
3700
    return (typo(src, tar, metric, cost) /
3701
            (max(len(src)*del_cost, len(tar)*ins_cost)))
3702
3703
3704
def sim_typo(src, tar, metric='euclidean', cost=(1, 1, 0.5, 0.5)):
3705
    """Return the normalized typo similarity between two strings.
3706
3707
    Normalized typo similarity is the complement of normalized typo distance:
3708
    :math:`sim_{typo} = 1 - dist_{typo}`.
3709
3710
    :param str src: source string for comparison
3711
    :param str tar: target string for comparison
3712
    :param str metric: supported values include: 'euclidean', 'manhattan',
3713
          'log-euclidean', and 'log-manhattan'
3714
    :param tuple cost: a 4-tuple representing the cost of the four possible
3715
        edits: inserts, deletes, substitutions, and shift, respectively (by
3716
        default: (1, 1, 0.5, 0.5)) The substitution & shift costs should be
3717
        significantly less than the cost of an insertion & deletion unless
3718
        a log metric is used.
3719
    :returns: normalized typo similarity
3720
    :rtype: float
3721
3722
    >>> round(sim_typo('cat', 'hat'), 12)
3723
    0.472953716914
3724
    >>> round(sim_typo('Niall', 'Neil'), 12)
3725
    0.434971857071
3726
    >>> round(sim_typo('Colin', 'Cuilen'), 12)
3727
    0.430964390437
3728
    >>> sim_typo('ATCG', 'TAGC')
3729
    0.375
3730
    """
3731
    return 1 - dist_typo(src, tar, metric, cost)
3732
3733
3734
def dist_indel(src, tar):
3735
    """Return the indel distance between two strings.
3736
3737
    This is equivalent to Levenshtein distance when only inserts and deletes
3738
    are allowed.
3739
3740
    :param str src: source string for comparison
3741
    :param str tar: target string for comparison
3742
    :returns: indel distance
3743
    :rtype: float
3744
3745
    >>> round(dist_indel('cat', 'hat'), 12)
3746
    0.333333333333
3747
    >>> round(dist_indel('Niall', 'Neil'), 12)
3748
    0.333333333333
3749
    >>> round(dist_indel('Colin', 'Cuilen'), 12)
3750
    0.454545454545
3751
    >>> dist_indel('ATCG', 'TAGC')
3752
    0.5
3753
    """
3754
    if src == tar:
3755
        return 0
3756
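    # The cost tuple (1, 1, 9999, 9999) effectively bars substitutions &
    # transpositions, so the Levenshtein call below yields a pure
    # insert/delete distance, whose maximum is len(src) + len(tar).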
    return (levenshtein(src, tar, mode='lev', cost=(1, 1, 9999, 9999)) /
3757
            (len(src) + len(tar)))
3758
3759
3760
def sim_indel(src, tar):
3761
    """Return the indel similarity of two strings.
3762
3763
    This is equivalent to Levenshtein similarity when only inserts and deletes
3764
    are allowed.
3765
3766
    :param str src: source string for comparison
3767
    :param str tar: target string for comparison
3768
    :returns: indel similarity
3769
    :rtype: float
3770
3771
    >>> round(sim_indel('cat', 'hat'), 12)
3772
    0.666666666667
3773
    >>> round(sim_indel('Niall', 'Neil'), 12)
3774
    0.666666666667
3775
    >>> round(sim_indel('Colin', 'Cuilen'), 12)
3776
    0.545454545455
3777
    >>> sim_indel('ATCG', 'TAGC')
3778
    0.5
3779
    """
3780
    return 1-dist_indel(src, tar)
3781
3782
3783
def _synoname_strip_punct(word):
3784
    """Return a word with punctuation stripped out.
3785
3786
    :param str word: a word to strip punctuation from
3787
    :returns: the word stripped of punctuation
3788
3789
    >>> _synoname_strip_punct('AB;CD EF-GH$IJ')
3790
    'ABCD EFGHIJ'
3791
    """
3792
    stripped = ''
3793
    for char in word:
3794
        if char not in set(',-./:;"&\'()!{|}?$%*+<=>[\\]^_`~'):
3795
            stripped += char
3796
    return stripped.strip()
3797
3798
3799
def _synoname_word_approximation(src_ln, tar_ln, src_fn='', tar_fn='',
3800
                                 features=None):
3801
    """Return the Synoname word approximation score for two names.
3802
3803
    :param str src_ln: last name of the source
3804
    :param str tar_ln: last name of the target
3805
    :param str src_fn: first name of the source (optional)
3806
    :param str tar_fn: first name of the target (optional)
3807
    :param dict features: a dict containing special features calculated via
3808
        fingerprint.synoname_toolcode() (optional)
3809
    :returns: The word approximation score
3810
    :rtype: float
3811
3812
    >>> _synoname_word_approximation('Smith Waterman', 'Waterman',
3813
    ... 'Tom Joe Bob', 'Tom Joe')
3814
    0.6
3815
    """
3816
    if features is None:
3817
        features = {}
3818
    if 'src_specials' not in features:
3819
        features['src_specials'] = []
3820
    if 'tar_specials' not in features:
3821
        features['tar_specials'] = []
3822
3823
    src_len_specials = len(features['src_specials'])
3824
    tar_len_specials = len(features['tar_specials'])
3825
3826
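    # The numbered comments below (1-10) mark the steps of Synoname's word
    # approximation procedure.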
    # 1
3827
    if (('gen_conflict' in features and features['gen_conflict']) or
3828
            ('roman_conflict' in features and features['roman_conflict'])):
3829
        return 0
3830
3831
    # 3 & 7
3832
    full_tar1 = ' '.join((tar_ln, tar_fn)).replace('-', ' ').strip()
3833
    for s_pos, s_type in features['tar_specials']:
3834
        if s_type == 'a':
3835
            full_tar1 = full_tar1[:-(1+len(_synoname_special_table[s_pos][1]))]
3836
        elif s_type == 'b':
3837
            loc = full_tar1.find(' '+_synoname_special_table[s_pos][1]+' ')+1
3838
            full_tar1 = (full_tar1[:loc] +
3839
                         full_tar1[loc +
3840
                                   len(_synoname_special_table[s_pos][1]):])
3841
        elif s_type == 'c':
3842
            full_tar1 = full_tar1[1+len(_synoname_special_table[s_pos][1]):]
3843
3844
    full_src1 = ' '.join((src_ln, src_fn)).replace('-', ' ').strip()
3845
    for s_pos, s_type in features['src_specials']:
3846
        if s_type == 'a':
3847
            full_src1 = full_src1[:-(1+len(_synoname_special_table[s_pos][1]))]
3848
        elif s_type == 'b':
3849
            loc = full_src1.find(' '+_synoname_special_table[s_pos][1]+' ')+1
3850
            full_src1 = (full_src1[:loc] +
3851
                         full_src1[loc +
3852
                                   len(_synoname_special_table[s_pos][1]):])
3853
        elif s_type == 'c':
3854
            full_src1 = full_src1[1+len(_synoname_special_table[s_pos][1]):]
3855
3856
    full_tar2 = full_tar1
3857
    for s_pos, s_type in features['tar_specials']:
3858
        if s_type == 'd':
3859
            full_tar2 = full_tar2[len(_synoname_special_table[s_pos][1]):]
3860
        elif s_type == 'X' and _synoname_special_table[s_pos][1] in full_tar2:
3861
            loc = full_tar2.find(' '+_synoname_special_table[s_pos][1])
3862
            full_tar2 = (full_tar2[:loc] +
3863
                         full_tar2[loc +
3864
                                   len(_synoname_special_table[s_pos][1]):])
3865
3866
    full_src2 = full_src1
3867
    for s_pos, s_type in features['src_specials']:
3868
        if s_type == 'd':
3869
            full_src2 = full_src2[len(_synoname_special_table[s_pos][1]):]
3870
        elif s_type == 'X' and _synoname_special_table[s_pos][1] in full_src2:
3871
            loc = full_src2.find(' '+_synoname_special_table[s_pos][1])
3872
            full_src2 = (full_src2[:loc] +
3873
                         full_src2[loc +
3874
                                   len(_synoname_special_table[s_pos][1]):])
3875
3876
    full_tar1 = _synoname_strip_punct(full_tar1)
3877
    tar1_words = full_tar1.split()
3878
    tar1_num_words = len(tar1_words)
3879
3880
    full_src1 = _synoname_strip_punct(full_src1)
3881
    src1_words = full_src1.split()
3882
    src1_num_words = len(src1_words)
3883
3884
    full_tar2 = _synoname_strip_punct(full_tar2)
3885
    tar2_words = full_tar2.split()
3886
    tar2_num_words = len(tar2_words)
3887
3888
    full_src2 = _synoname_strip_punct(full_src2)
3889
    src2_words = full_src2.split()
3890
    src2_num_words = len(src2_words)
3891
3892
    # 2
3893
    if (src1_num_words < 2 and src_len_specials == 0 and src2_num_words < 2 and
3894
            tar_len_specials == 0):
3895
        return 0
3896
3897
    # 4
3898
    if (tar1_num_words == 1 and src1_num_words == 1 and
3899
            tar1_words[0] == src1_words[0]):
3900
        return 1
3901
    if tar1_num_words < 2 and tar_len_specials == 0:
3902
        return 0
3903
3904
    # 5
3905
    last_found = False
3906
    for word in tar1_words:
3907
        if src_ln.endswith(word) or word+' ' in src_ln:
3908
            last_found = True
3909
3910
    if not last_found:
3911
        for word in src1_words:
3912
            if tar_ln.endswith(word) or word+' ' in tar_ln:
3913
                last_found = True
3914
3915
    # 6
3916
    matches = 0
3917
    if last_found:
3918
        for i, s_word in enumerate(src1_words):
3919
            for j, t_word in enumerate(tar1_words):
3920
                if s_word == t_word:
3921
                    src1_words[i] = '@'
3922
                    tar1_words[j] = '@'
3923
                    matches += 1
3924
    w_ratio = matches/max(tar1_num_words, src1_num_words)
3925
    if matches > 1 or (matches == 1 and
3926
                       src1_num_words == 1 and tar1_num_words == 1 and
3927
                       (tar_len_specials > 0 or src_len_specials > 0)):
3928
        return w_ratio
3929
3930
    # 8
3931
    if (tar2_num_words == 1 and src2_num_words == 1 and
3932
            tar2_words[0] == src2_words[0]):
3933
        return 1
3934
    # I see no way that the following can be True if the equivalent in
3935
    # #4 was False.
3936
    if tar2_num_words < 2 and tar_len_specials == 0:  # pragma: no cover
3937
        return 0
3938
3939
    # 9
3940
    last_found = False
3941
    for word in tar2_words:
3942
        if src_ln.endswith(word) or word+' ' in src_ln:
3943
            last_found = True
3944
3945
    if not last_found:
3946
        for word in src2_words:
3947
            if tar_ln.endswith(word) or word+' ' in tar_ln:
3948
                last_found = True
3949
3950
    if not last_found:
3951
        return 0
3952
3953
    # 10
3954
    matches = 0
3955
    if last_found:
3956
        for i, s_word in enumerate(src2_words):
3957
            for j, t_word in enumerate(tar2_words):
3958
                if s_word == t_word:
3959
                    src2_words[i] = '@'
3960
                    tar2_words[j] = '@'
3961
                    matches += 1
3962
    w_ratio = matches/max(tar2_num_words, src2_num_words)
3963
    if matches > 1 or (matches == 1 and
3964
                       src2_num_words == 1 and tar2_num_words == 1 and
3965
                       (tar_len_specials > 0 or src_len_specials > 0)):
3966
        return w_ratio
3967
3968
    return 0
3969
3970
3971
def synoname(src, tar, word_approx_min=0.3, char_approx_min=0.73,
3972
             tests=2**12-1, ret_name=False):
3973
    """Return the Synoname similarity type of two words.
3974
3975
    Cf. :cite:`Getty:1991,Gross:1991`
3976
3977
    :param str src: source string for comparison
3978
    :param str tar: target string for comparison
3981
    :param float word_approx_min: the minimum word approximation value to
3982
        signal a 'word_approx' match
3983
    :param float char_approx_min: the minimum character approximation value to
3984
        signal a 'char_approx' match
3985
    :param int or Iterable tests: either an integer indicating tests to
3986
        perform or a list of test names to perform (defaults to performing all
3987
        tests)
3988
    :param bool ret_name: if True, returns the match name rather than its
3989
        integer equivalent
3990
    :returns: Synoname value
3991
    :rtype: int (or str if ret_name is True)
3992
3993
    >>> synoname(('Breghel', 'Pieter', ''), ('Brueghel', 'Pieter', ''))
3994
    2
3995
    >>> synoname(('Breghel', 'Pieter', ''), ('Brueghel', 'Pieter', ''),
3996
    ... ret_name=True)
3997
    'omission'
3998
    >>> synoname(('Dore', 'Gustave', ''),
3999
    ... ('Dore', 'Paul Gustave Louis Christophe', ''),
4000
    ... ret_name=True)
4001
    'inclusion'
4002
    >>> synoname(('Pereira', 'I. R.', ''), ('Pereira', 'I. Smith', ''),
4003
    ... ret_name=True)
4004
    'word_approx'
4005
    """
4006
    test_dict = {val: 2**n for n, val in enumerate([
4007
        'exact', 'omission', 'substitution', 'transposition', 'punctuation',
4008
        'initials', 'extension', 'inclusion', 'no_first', 'word_approx',
4009
        'confusions', 'char_approx'])}
4010
    match_name = ['', 'exact', 'omission', 'substitution', 'transposition',
4011
                  'punctuation', 'initials', 'extension', 'inclusion',
4012
                  'no_first', 'word_approx', 'confusions', 'char_approx',
4013
                  'no_match']
4014
    match_type_dict = {val: n for n, val in enumerate(match_name)}
4015
4016
    if isinstance(tests, Iterable):
4017
        new_tests = 0
4018
        for term in tests:
4019
            if term in test_dict:
4020
                new_tests += test_dict[term]
4021
        tests = new_tests
4022
4023
    if isinstance(src, tuple):
4024
        src_ln, src_fn, src_qual = src
4025
    elif '#' in src:
4026
        src_ln, src_fn, src_qual = src.split('#')[-3:]
4027
    else:
4028
        src_ln, src_fn, src_qual = src, '', ''
4029
4030
    if isinstance(tar, tuple):
4031
        tar_ln, tar_fn, tar_qual = tar
4032
    elif '#' in tar:
4033
        tar_ln, tar_fn, tar_qual = tar.split('#')[-3:]
4034
    else:
4035
        tar_ln, tar_fn, tar_qual = tar, '', ''
4036
4037
    def _split_special(spec):
4038
        spec_list = []
4039
        while spec:
4040
            spec_list.append((int(spec[:3]), spec[3:4]))
4041
            spec = spec[4:]
4042
        return spec_list
4043
4044
    def _fmt_retval(val):
4045
        if ret_name:
4046
            return match_name[val]
4047
        return val
4048
4049
    # 1. Preprocessing
4050
4051
    # Lowercasing
4052
    src_fn = src_fn.strip().lower()
4053
    src_ln = src_ln.strip().lower()
4054
    src_qual = src_qual.strip().lower()
4055
4056
    tar_fn = tar_fn.strip().lower()
4057
    tar_ln = tar_ln.strip().lower()
4058
    tar_qual = tar_qual.strip().lower()
4059
4060
    # Create toolcodes
4061
    src_ln, src_fn, src_tc = synoname_toolcode(src_ln, src_fn, src_qual)
4062
    tar_ln, tar_fn, tar_tc = synoname_toolcode(tar_ln, tar_fn, tar_qual)
4063
4064
    src_generation = int(src_tc[2])
4065
    src_romancode = int(src_tc[3:6])
4066
    src_len_fn = int(src_tc[6:8])
4067
    src_tc = src_tc.split('$')
4068
    src_specials = _split_special(src_tc[1])
4069
4070
    tar_generation = int(tar_tc[2])
4071
    tar_romancode = int(tar_tc[3:6])
4072
    tar_len_fn = int(tar_tc[6:8])
4073
    tar_tc = tar_tc.split('$')
4074
    tar_specials = _split_special(tar_tc[1])
4075
4076
    gen_conflict = ((src_generation != tar_generation) and
4077
                    bool(src_generation or tar_generation))
4078
    roman_conflict = ((src_romancode != tar_romancode) and
4079
                      bool(src_romancode or tar_romancode))
4080
4081
    ln_equal = src_ln == tar_ln
4082
    fn_equal = src_fn == tar_fn
4083
4084
    # approx_c
4085
    def _approx_c():
4086
        if gen_conflict or roman_conflict:
4087
            return False, 0
4088
4089
        full_src = ' '.join((src_ln, src_fn))
4090
        if full_src.startswith('master '):
4091
            full_src = full_src[len('master '):]
4092
            for intro in ['of the ', 'of ', 'known as the ', 'with the ',
4093
                          'with ']:
4094
                if full_src.startswith(intro):
4095
                    full_src = full_src[len(intro):]
4096
4097
        full_tar = ' '.join((tar_ln, tar_fn))
4098
        if full_tar.startswith('master '):
4099
            full_tar = full_tar[len('master '):]
4100
            for intro in ['of the ', 'of ', 'known as the ', 'with the ',
4101
                          'with ']:
4102
                if full_tar.startswith(intro):
4103
                    full_tar = full_tar[len(intro):]
4104
4105
        loc_ratio = sim_ratcliff_obershelp(full_src, full_tar)
4106
        return loc_ratio >= char_approx_min, loc_ratio
4107
4108
    approx_c_result, ca_ratio = _approx_c()
4109
4110
    if tests & test_dict['exact'] and fn_equal and ln_equal:
4111
        return _fmt_retval(match_type_dict['exact'])
4112
    if tests & test_dict['omission']:
4113
        if (fn_equal and
4114
                levenshtein(src_ln, tar_ln, cost=(1, 1, 99, 99)) == 1):
4115
            if not roman_conflict:
4116
                return _fmt_retval(match_type_dict['omission'])
4117
        elif (ln_equal and
4118
              levenshtein(src_fn, tar_fn, cost=(1, 1, 99, 99)) == 1):
4119
            return _fmt_retval(match_type_dict['omission'])
4120
    if tests & test_dict['substitution']:
4121
        if (fn_equal and
4122
                levenshtein(src_ln, tar_ln, cost=(99, 99, 1, 99)) == 1):
4123
            return _fmt_retval(match_type_dict['substitution'])
4124
        elif (ln_equal and
4125
              levenshtein(src_fn, tar_fn, cost=(99, 99, 1, 99)) == 1):
4126
            return _fmt_retval(match_type_dict['substitution'])
4127
    if tests & test_dict['transposition']:
4128
        if (fn_equal and
4129
                (levenshtein(src_ln, tar_ln, mode='osa', cost=(99, 99, 99, 1))
4130
                 == 1)):
4131
            return _fmt_retval(match_type_dict['transposition'])
4132
        elif (ln_equal and
4133
              (levenshtein(src_fn, tar_fn, mode='osa', cost=(99, 99, 99, 1))
4134
               == 1)):
4135
            return _fmt_retval(match_type_dict['transposition'])
4136
    if tests & test_dict['punctuation']:
4137
        np_src_fn = _synoname_strip_punct(src_fn)
4138
        np_tar_fn = _synoname_strip_punct(tar_fn)
4139
        np_src_ln = _synoname_strip_punct(src_ln)
4140
        np_tar_ln = _synoname_strip_punct(tar_ln)
4141
4142
        if (np_src_fn == np_tar_fn) and (np_src_ln == np_tar_ln):
4143
            return _fmt_retval(match_type_dict['punctuation'])
4144
4145
        np_src_fn = _synoname_strip_punct(src_fn.replace('-', ' '))
4146
        np_tar_fn = _synoname_strip_punct(tar_fn.replace('-', ' '))
4147
        np_src_ln = _synoname_strip_punct(src_ln.replace('-', ' '))
4148
        np_tar_ln = _synoname_strip_punct(tar_ln.replace('-', ' '))
4149
4150
        if (np_src_fn == np_tar_fn) and (np_src_ln == np_tar_ln):
4151
            return _fmt_retval(match_type_dict['punctuation'])
4152
4153
    if tests & test_dict['initials'] and ln_equal:
4154
        if src_fn and tar_fn:
4155
            src_initials = _synoname_strip_punct(src_fn).split()
4156
            tar_initials = _synoname_strip_punct(tar_fn).split()
4157
            initials = bool((len(src_initials) == len(''.join(src_initials)))
4158
                            or
4159
                            (len(tar_initials) == len(''.join(tar_initials))))
4160
            if initials:
4161
                src_initials = ''.join(_[0] for _ in src_initials)
4162
                tar_initials = ''.join(_[0] for _ in tar_initials)
4163
                if src_initials == tar_initials:
4164
                    return _fmt_retval(match_type_dict['initials'])
4165
                initial_diff = abs(len(src_initials)-len(tar_initials))
4166
                if (initial_diff and
4167
                        ((initial_diff ==
4168
                          levenshtein(src_initials, tar_initials,
4169
                                      cost=(1, 99, 99, 99))) or
4170
                         (initial_diff ==
4171
                          levenshtein(tar_initials, src_initials,
4172
                                      cost=(1, 99, 99, 99))))):
4173
                    return _fmt_retval(match_type_dict['initials'])
4174
    if tests & test_dict['extension']:
4175
        if src_ln[1] == tar_ln[1] and (src_ln.startswith(tar_ln) or
4176
                                       tar_ln.startswith(src_ln)):
4177
            if (((not src_len_fn and not tar_len_fn) or
4178
                 (tar_fn and src_fn.startswith(tar_fn)) or
4179
                 (src_fn and tar_fn.startswith(src_fn)))
4180
                    and not roman_conflict):
4181
                return _fmt_retval(match_type_dict['extension'])
4182
    if tests & test_dict['inclusion'] and ln_equal:
4183
        if (src_fn and src_fn in tar_fn) or (tar_fn and tar_fn in src_fn):
4184
            return _fmt_retval(match_type_dict['inclusion'])
4185
    if tests & test_dict['no_first'] and ln_equal:
4186
        if src_fn == '' or tar_fn == '':
4187
            return _fmt_retval(match_type_dict['no_first'])
4188
    if tests & test_dict['word_approx']:
4189
        ratio = _synoname_word_approximation(src_ln, tar_ln, src_fn, tar_fn,
4190
                                             {'gen_conflict': gen_conflict,
4191
                                              'roman_conflict': roman_conflict,
4192
                                              'src_specials': src_specials,
4193
                                              'tar_specials': tar_specials})
4194
        if ratio == 1 and tests & test_dict['confusions']:
4195
            if (' '.join((src_fn, src_ln)).strip() ==
4196
                    ' '.join((tar_fn, tar_ln)).strip()):
4197
                return _fmt_retval(match_type_dict['confusions'])
4198
        if ratio >= word_approx_min:
4199
            return _fmt_retval(match_type_dict['word_approx'])
4200
    if tests & test_dict['char_approx']:
4201
        if ca_ratio >= char_approx_min:
4202
            return _fmt_retval(match_type_dict['char_approx'])
4203
    return _fmt_retval(match_type_dict['no_match'])
4204
4205
4206
###############################################################################
4207
4208
4209
def sim(src, tar, method=sim_levenshtein):
4210
    """Return a similarity of two strings.
4211
4212
    This is a generalized function for calling other similarity functions.
4213
4214
    :param str src: source string for comparison
4215
    :param str tar: target string for comparison
4216
    :param function method: specifies the similarity metric (Levenshtein by
4217
        default)
4218
    :returns: similarity according to the specified function
4219
    :rtype: float
4220
4221
    >>> round(sim('cat', 'hat'), 12)
4222
    0.666666666667
4223
    >>> round(sim('Niall', 'Neil'), 12)
4224
    0.4
4225
    >>> sim('aluminum', 'Catalan')
4226
    0.125
4227
    >>> sim('ATCG', 'TAGC')
4228
    0.25
4229
    """
4230
    if callable(method):
4231
        return method(src, tar)
4232
    else:
4233
        raise AttributeError('Unknown similarity function: ' + str(method))
4234
4235
4236
def dist(src, tar, method=sim_levenshtein):
4237
    """Return a distance between two strings.
4238
4239
    This is a generalized function for calling other distance functions.
4240
4241
    :param str src: source string for comparison
4242
    :param str tar: target string for comparison
4243
    :param function method: specifies the similarity metric (Levenshtein by
4244
        default) -- Note that this takes a similarity metric function, not
4245
        a distance metric function.
4246
    :returns: distance according to the specified function
4247
    :rtype: float
4248
4249
    >>> round(dist('cat', 'hat'), 12)
4250
    0.333333333333
4251
    >>> round(dist('Niall', 'Neil'), 12)
4252
    0.6
4253
    >>> dist('aluminum', 'Catalan')
4254
    0.875
4255
    >>> dist('ATCG', 'TAGC')
4256
    0.75
4257
    """
4258
    if callable(method):
4259
        return 1 - method(src, tar)
4260
    else:
4261
        raise AttributeError('Unknown distance function: ' + str(method))
4262
4263
4264
if __name__ == '__main__':
4265
    import doctest
4266
    doctest.testmod()
4267