Completed
Branch master (78a222)
by Chris
14:36
created

abydos.distance._editex   A

Complexity

Total Complexity 21

Size/Duplication

Total Lines 222
Duplicated Lines 0 %

Test Coverage

Coverage 100%

Importance

Changes 0
Metric Value
wmc 21
eloc 96
dl 0
loc 222
ccs 55
cts 55
cp 1
rs 10
c 0
b 0
f 0

3 Functions

Rating   Name   Duplication   Size   Complexity  
F editex() 0 116 18
A sim_editex() 0 25 1
A dist_editex() 0 33 2
1
# -*- coding: utf-8 -*-
2
3
# Copyright 2014-2018 by Christopher C. Little.
4
# This file is part of Abydos.
5
#
6
# Abydos is free software: you can redistribute it and/or modify
7
# it under the terms of the GNU General Public License as published by
8
# the Free Software Foundation, either version 3 of the License, or
9
# (at your option) any later version.
10
#
11
# Abydos is distributed in the hope that it will be useful,
12
# but WITHOUT ANY WARRANTY; without even the implied warranty of
13
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14
# GNU General Public License for more details.
15
#
16
# You should have received a copy of the GNU General Public License
17
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.
18
19 1
"""abydos.distance.editex.
20
21
The distance.editex module implements editex functions.
22
"""
23
24 1
from __future__ import division, unicode_literals
25
26 1
from unicodedata import normalize as unicode_normalize
27
28 1
from numpy import int as np_int
29 1
from numpy import zeros as np_zeros
30
31 1
from six import text_type
32 1
from six.moves import range
33
34 1
__all__ = ['dist_editex', 'editex', 'sim_editex']
35
36
37 1
def editex(src, tar, cost=(0, 1, 2), local=False):
0 ignored issues
show
Comprehensibility introduced by
This function exceeds the maximum number of variables (16/15).
Loading history...
38
    """Return the Editex distance between two strings.
39
40
    As described on pages 3 & 4 of :cite:`Zobel:1996`.
41
42
    The local variant is based on :cite:`Ring:2009`.
43
44
    :param str src: source string for comparison
45
    :param str tar: target string for comparison
46
    :param tuple cost: a 3-tuple representing the cost of the four possible
47
        edits:
48
        match, same-group, and mismatch respectively (by default: (0, 1, 2))
49
    :param bool local: if True, the local variant of Editex is used
50
    :returns: Editex distance
51
    :rtype: int
52
53
    >>> editex('cat', 'hat')
54
    2
55
    >>> editex('Niall', 'Neil')
56
    2
57
    >>> editex('aluminum', 'Catalan')
58
    12
59
    >>> editex('ATCG', 'TAGC')
60
    6
61
    """
62 1
    match_cost, group_cost, mismatch_cost = cost
63 1
    letter_groups = (
64
        {'A', 'E', 'I', 'O', 'U', 'Y'},
65
        {'B', 'P'},
66
        {'C', 'K', 'Q'},
67
        {'D', 'T'},
68
        {'L', 'R'},
69
        {'M', 'N'},
70
        {'G', 'J'},
71
        {'F', 'P', 'V'},
72
        {'S', 'X', 'Z'},
73
        {'C', 'S', 'Z'},
74
    )
75 1
    all_letters = {
76
        'A',
77
        'B',
78
        'C',
79
        'D',
80
        'E',
81
        'F',
82
        'G',
83
        'I',
84
        'J',
85
        'K',
86
        'L',
87
        'M',
88
        'N',
89
        'O',
90
        'P',
91
        'Q',
92
        'R',
93
        'S',
94
        'T',
95
        'U',
96
        'V',
97
        'X',
98
        'Y',
99
        'Z',
100
    }
101
102 1
    def r_cost(ch1, ch2):
103
        """Return r(a,b) according to Zobel & Dart's definition."""
104 1
        if ch1 == ch2:
105 1
            return match_cost
106 1
        if ch1 in all_letters and ch2 in all_letters:
107 1
            for group in letter_groups:
108 1
                if ch1 in group and ch2 in group:
109 1
                    return group_cost
110 1
        return mismatch_cost
111
112 1
    def d_cost(ch1, ch2):
113
        """Return d(a,b) according to Zobel & Dart's definition."""
114 1
        if ch1 != ch2 and (ch1 == 'H' or ch1 == 'W'):
115 1
            return group_cost
116 1
        return r_cost(ch1, ch2)
117
118
    # convert both src & tar to NFKD normalized unicode
119 1
    src = unicode_normalize('NFKD', text_type(src.upper()))
120 1
    tar = unicode_normalize('NFKD', text_type(tar.upper()))
121
    # convert ß to SS (for Python2)
122 1
    src = src.replace('ß', 'SS')
123 1
    tar = tar.replace('ß', 'SS')
124
125 1
    if src == tar:
126 1
        return 0
127 1
    if not src:
128 1
        return len(tar) * mismatch_cost
129 1
    if not tar:
130 1
        return len(src) * mismatch_cost
131
132 1
    d_mat = np_zeros((len(src) + 1, len(tar) + 1), dtype=np_int)
133 1
    lens = len(src)
134 1
    lent = len(tar)
135 1
    src = ' ' + src
136 1
    tar = ' ' + tar
137
138 1
    if not local:
139 1
        for i in range(1, lens + 1):
140 1
            d_mat[i, 0] = d_mat[i - 1, 0] + d_cost(src[i - 1], src[i])
141 1
    for j in range(1, lent + 1):
142 1
        d_mat[0, j] = d_mat[0, j - 1] + d_cost(tar[j - 1], tar[j])
143
144 1
    for i in range(1, lens + 1):
145 1
        for j in range(1, lent + 1):
146 1
            d_mat[i, j] = min(
147
                d_mat[i - 1, j] + d_cost(src[i - 1], src[i]),
148
                d_mat[i, j - 1] + d_cost(tar[j - 1], tar[j]),
149
                d_mat[i - 1, j - 1] + r_cost(src[i], tar[j]),
150
            )
151
152 1
    return d_mat[lens, lent]
153
154
155 1
def dist_editex(src, tar, cost=(0, 1, 2), local=False):
156
    """Return the normalized Editex distance between two strings.
157
158
    The Editex distance is normalized by dividing the Editex distance
159
    (calculated by any of the three supported methods) by the greater of
160
    the number of characters in src times the cost of a delete and
161
    the number of characters in tar times the cost of an insert.
162
    For the case in which all operations have :math:`cost = 1`, this is
163
    equivalent to the greater of the length of the two strings src & tar.
164
165
    :param str src: source string for comparison
166
    :param str tar: target string for comparison
167
    :param tuple cost: a 3-tuple representing the cost of the four possible
168
        edits:
169
        match, same-group, and mismatch respectively (by default: (0, 1, 2))
170
    :param bool local: if True, the local variant of Editex is used
171
    :returns: normalized Editex distance
172
    :rtype: float
173
174
    >>> round(dist_editex('cat', 'hat'), 12)
175
    0.333333333333
176
    >>> round(dist_editex('Niall', 'Neil'), 12)
177
    0.2
178
    >>> dist_editex('aluminum', 'Catalan')
179
    0.75
180
    >>> dist_editex('ATCG', 'TAGC')
181
    0.75
182
    """
183 1
    if src == tar:
184 1
        return 0
185 1
    mismatch_cost = cost[2]
186 1
    return editex(src, tar, cost, local) / (
187
        max(len(src) * mismatch_cost, len(tar) * mismatch_cost)
188
    )
189
190
191 1
def sim_editex(src, tar, cost=(0, 1, 2), local=False):
192
    """Return the normalized Editex similarity of two strings.
193
194
    The Editex similarity is the complement of Editex distance:
195
    :math:`sim_{Editex} = 1 - dist_{Editex}`.
196
197
    :param str src: source string for comparison
198
    :param str tar: target string for comparison
199
    :param tuple cost: a 3-tuple representing the cost of the four possible
200
        edits:
201
        match, same-group, and mismatch respectively (by default: (0, 1, 2))
202
    :param bool local: if True, the local variant of Editex is used
203
    :returns: normalized Editex similarity
204
    :rtype: float
205
206
    >>> round(sim_editex('cat', 'hat'), 12)
207
    0.666666666667
208
    >>> round(sim_editex('Niall', 'Neil'), 12)
209
    0.8
210
    >>> sim_editex('aluminum', 'Catalan')
211
    0.25
212
    >>> sim_editex('ATCG', 'TAGC')
213
    0.25
214
    """
215 1
    return 1 - dist_editex(src, tar, cost, local)
216
217
218
if __name__ == '__main__':
219
    import doctest
220
221
    doctest.testmod()
222