1
|
|
|
# -*- coding: utf-8 -*- |
2
|
|
|
|
3
|
|
|
# Copyright 2014-2018 by Christopher C. Little. |
4
|
|
|
# This file is part of Abydos. |
5
|
|
|
# |
6
|
|
|
# Abydos is free software: you can redistribute it and/or modify |
7
|
|
|
# it under the terms of the GNU General Public License as published by |
8
|
|
|
# the Free Software Foundation, either version 3 of the License, or |
9
|
|
|
# (at your option) any later version. |
10
|
|
|
# |
11
|
|
|
# Abydos is distributed in the hope that it will be useful, |
12
|
|
|
# but WITHOUT ANY WARRANTY; without even the implied warranty of |
13
|
|
|
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
14
|
|
|
# GNU General Public License for more details. |
15
|
|
|
# |
16
|
|
|
# You should have received a copy of the GNU General Public License |
17
|
|
|
# along with Abydos. If not, see <http://www.gnu.org/licenses/>. |
18
|
|
|
|
19
|
1 |
|
"""abydos.distance._editex. |
20
|
|
|
|
21
|
|
|
editex |
22
|
|
|
""" |
23
|
|
|
|
24
|
1 |
|
from __future__ import ( |
25
|
|
|
absolute_import, |
26
|
|
|
division, |
27
|
|
|
print_function, |
28
|
|
|
unicode_literals, |
29
|
|
|
) |
30
|
|
|
|
31
|
1 |
|
from unicodedata import normalize as unicode_normalize |
32
|
|
|
|
33
|
1 |
|
from numpy import int as np_int |
34
|
1 |
|
from numpy import zeros as np_zeros |
35
|
|
|
|
36
|
1 |
|
from six import text_type |
37
|
1 |
|
from six.moves import range |
38
|
|
|
|
39
|
1 |
|
from ._distance import _Distance |
40
|
|
|
|
41
|
1 |
|
__all__ = ['Editex', 'dist_editex', 'editex', 'sim_editex'] |
42
|
|
|
|
43
|
|
|
|
44
|
1 |
|
class Editex(_Distance): |
|
|
|
|
45
|
|
|
"""Editex. |
46
|
|
|
|
47
|
|
|
As described on pages 3 & 4 of :cite:`Zobel:1996`. |
48
|
|
|
|
49
|
|
|
The local variant is based on :cite:`Ring:2009`. |
50
|
|
|
""" |
51
|
|
|
|
52
|
1 |
|
_letter_groups = ( |
53
|
|
|
frozenset('AEIOUY'), |
54
|
|
|
frozenset('BP'), |
55
|
|
|
frozenset('CKQ'), |
56
|
|
|
frozenset('DT'), |
57
|
|
|
frozenset('LR'), |
58
|
|
|
frozenset('MN'), |
59
|
|
|
frozenset('GJ'), |
60
|
|
|
frozenset('FPV'), |
61
|
|
|
frozenset('SXZ'), |
62
|
|
|
) |
63
|
|
|
|
64
|
1 |
|
_all_letters = frozenset('ABCDEFGIJKLMNOPQRSTUVXYZ') |
65
|
|
|
|
66
|
1 |
|
def dist_abs(self, src, tar, cost=(0, 1, 2), local=False): |
|
|
|
|
67
|
|
|
"""Return the Editex distance between two strings. |
68
|
|
|
|
69
|
|
|
Parameters |
70
|
|
|
---------- |
71
|
|
|
src : str |
72
|
|
|
Source string for comparison |
73
|
|
|
tar : str |
74
|
|
|
Target string for comparison |
75
|
|
|
cost : tuple |
76
|
|
|
A 3-tuple representing the cost of the four possible edits: match, |
77
|
|
|
same-group, and mismatch respectively (by default: (0, 1, 2)) |
78
|
|
|
local : bool |
79
|
|
|
If True, the local variant of Editex is used |
80
|
|
|
|
81
|
|
|
Returns |
82
|
|
|
------- |
83
|
|
|
int |
84
|
|
|
Editex distance |
85
|
|
|
|
86
|
|
|
Examples |
87
|
|
|
-------- |
88
|
|
|
>>> cmp = Editex() |
89
|
|
|
>>> cmp.dist_abs('cat', 'hat') |
90
|
|
|
2 |
91
|
|
|
>>> cmp.dist_abs('Niall', 'Neil') |
92
|
|
|
2 |
93
|
|
|
>>> cmp.dist_abs('aluminum', 'Catalan') |
94
|
|
|
12 |
95
|
|
|
>>> cmp.dist_abs('ATCG', 'TAGC') |
96
|
|
|
6 |
97
|
|
|
|
98
|
|
|
""" |
99
|
1 |
|
match_cost, group_cost, mismatch_cost = cost |
100
|
|
|
|
101
|
1 |
|
def r_cost(ch1, ch2): |
102
|
|
|
"""Return r(a,b) according to Zobel & Dart's definition. |
103
|
|
|
|
104
|
|
|
Parameters |
105
|
|
|
---------- |
106
|
|
|
ch1 : str |
107
|
|
|
The first character to compare |
108
|
|
|
ch2 : str |
109
|
|
|
The second character to compare |
110
|
|
|
|
111
|
|
|
Returns |
112
|
|
|
------- |
113
|
|
|
int |
114
|
|
|
r(a,b) according to Zobel & Dart's definition |
115
|
|
|
|
116
|
|
|
""" |
117
|
1 |
|
if ch1 == ch2: |
118
|
1 |
|
return match_cost |
119
|
1 |
|
if ch1 in self._all_letters and ch2 in self._all_letters: |
120
|
1 |
|
for group in self._letter_groups: |
121
|
1 |
|
if ch1 in group and ch2 in group: |
122
|
1 |
|
return group_cost |
123
|
1 |
|
return mismatch_cost |
124
|
|
|
|
125
|
1 |
|
def d_cost(ch1, ch2): |
126
|
|
|
"""Return d(a,b) according to Zobel & Dart's definition. |
127
|
|
|
|
128
|
|
|
Parameters |
129
|
|
|
---------- |
130
|
|
|
ch1 : str |
131
|
|
|
The first character to compare |
132
|
|
|
ch2 : str |
133
|
|
|
The second character to compare |
134
|
|
|
|
135
|
|
|
Returns |
136
|
|
|
------- |
137
|
|
|
int |
138
|
|
|
d(a,b) according to Zobel & Dart's definition |
139
|
|
|
|
140
|
|
|
""" |
141
|
1 |
|
if ch1 != ch2 and (ch1 == 'H' or ch1 == 'W'): |
142
|
1 |
|
return group_cost |
143
|
1 |
|
return r_cost(ch1, ch2) |
144
|
|
|
|
145
|
|
|
# convert both src & tar to NFKD normalized unicode |
146
|
1 |
|
src = unicode_normalize('NFKD', text_type(src.upper())) |
147
|
1 |
|
tar = unicode_normalize('NFKD', text_type(tar.upper())) |
148
|
|
|
# convert ß to SS (for Python2) |
149
|
1 |
|
src = src.replace('ß', 'SS') |
150
|
1 |
|
tar = tar.replace('ß', 'SS') |
151
|
|
|
|
152
|
1 |
|
if src == tar: |
153
|
1 |
|
return 0.0 |
154
|
1 |
|
if not src: |
155
|
1 |
|
return len(tar) * mismatch_cost |
156
|
1 |
|
if not tar: |
157
|
1 |
|
return len(src) * mismatch_cost |
158
|
|
|
|
159
|
1 |
|
d_mat = np_zeros((len(src) + 1, len(tar) + 1), dtype=np_int) |
160
|
1 |
|
lens = len(src) |
161
|
1 |
|
lent = len(tar) |
162
|
1 |
|
src = ' ' + src |
163
|
1 |
|
tar = ' ' + tar |
164
|
|
|
|
165
|
1 |
|
if not local: |
166
|
1 |
|
for i in range(1, lens + 1): |
167
|
1 |
|
d_mat[i, 0] = d_mat[i - 1, 0] + d_cost(src[i - 1], src[i]) |
168
|
1 |
|
for j in range(1, lent + 1): |
169
|
1 |
|
d_mat[0, j] = d_mat[0, j - 1] + d_cost(tar[j - 1], tar[j]) |
170
|
|
|
|
171
|
1 |
|
for i in range(1, lens + 1): |
172
|
1 |
|
for j in range(1, lent + 1): |
173
|
1 |
|
d_mat[i, j] = min( |
174
|
|
|
d_mat[i - 1, j] + d_cost(src[i - 1], src[i]), |
175
|
|
|
d_mat[i, j - 1] + d_cost(tar[j - 1], tar[j]), |
176
|
|
|
d_mat[i - 1, j - 1] + r_cost(src[i], tar[j]), |
177
|
|
|
) |
178
|
|
|
|
179
|
1 |
|
return d_mat[lens, lent] |
180
|
|
|
|
181
|
1 |
|
def dist(self, src, tar, cost=(0, 1, 2), local=False): |
|
|
|
|
182
|
|
|
"""Return the normalized Editex distance between two strings. |
183
|
|
|
|
184
|
|
|
The Editex distance is normalized by dividing the Editex distance |
185
|
|
|
(calculated by any of the three supported methods) by the greater of |
186
|
|
|
the number of characters in src times the cost of a delete and |
187
|
|
|
the number of characters in tar times the cost of an insert. |
188
|
|
|
For the case in which all operations have :math:`cost = 1`, this is |
189
|
|
|
equivalent to the greater of the length of the two strings src & tar. |
190
|
|
|
|
191
|
|
|
Parameters |
192
|
|
|
---------- |
193
|
|
|
src : str |
194
|
|
|
Source string for comparison |
195
|
|
|
tar : str |
196
|
|
|
Target string for comparison |
197
|
|
|
cost : tuple |
198
|
|
|
A 3-tuple representing the cost of the four possible edits: match, |
199
|
|
|
same-group, and mismatch respectively (by default: (0, 1, 2)) |
200
|
|
|
local : bool |
201
|
|
|
If True, the local variant of Editex is used |
202
|
|
|
|
203
|
|
|
Returns |
204
|
|
|
------- |
205
|
|
|
int |
206
|
|
|
Normalized Editex distance |
207
|
|
|
|
208
|
|
|
Examples |
209
|
|
|
-------- |
210
|
|
|
>>> cmp = Editex() |
211
|
|
|
>>> round(cmp.dist('cat', 'hat'), 12) |
212
|
|
|
0.333333333333 |
213
|
|
|
>>> round(cmp.dist('Niall', 'Neil'), 12) |
214
|
|
|
0.2 |
215
|
|
|
>>> cmp.dist('aluminum', 'Catalan') |
216
|
|
|
0.75 |
217
|
|
|
>>> cmp.dist('ATCG', 'TAGC') |
218
|
|
|
0.75 |
219
|
|
|
|
220
|
|
|
""" |
221
|
1 |
|
if src == tar: |
222
|
1 |
|
return 0.0 |
223
|
1 |
|
mismatch_cost = cost[2] |
224
|
1 |
|
return self.dist_abs(src, tar, cost, local) / ( |
225
|
|
|
max(len(src) * mismatch_cost, len(tar) * mismatch_cost) |
226
|
|
|
) |
227
|
|
|
|
228
|
|
|
|
229
|
1 |
|
def editex(src, tar, cost=(0, 1, 2), local=False): |
230
|
|
|
"""Return the Editex distance between two strings. |
231
|
|
|
|
232
|
|
|
This is a wrapper for :py:meth:`Editex.dist_abs`. |
233
|
|
|
|
234
|
|
|
Parameters |
235
|
|
|
---------- |
236
|
|
|
src : str |
237
|
|
|
Source string for comparison |
238
|
|
|
tar : str |
239
|
|
|
Target string for comparison |
240
|
|
|
cost : tuple |
241
|
|
|
A 3-tuple representing the cost of the four possible edits: match, |
242
|
|
|
same-group, and mismatch respectively (by default: (0, 1, 2)) |
243
|
|
|
local : bool |
244
|
|
|
If True, the local variant of Editex is used |
245
|
|
|
|
246
|
|
|
Returns |
247
|
|
|
------- |
248
|
|
|
int |
249
|
|
|
Editex distance |
250
|
|
|
|
251
|
|
|
Examples |
252
|
|
|
-------- |
253
|
|
|
>>> editex('cat', 'hat') |
254
|
|
|
2 |
255
|
|
|
>>> editex('Niall', 'Neil') |
256
|
|
|
2 |
257
|
|
|
>>> editex('aluminum', 'Catalan') |
258
|
|
|
12 |
259
|
|
|
>>> editex('ATCG', 'TAGC') |
260
|
|
|
6 |
261
|
|
|
|
262
|
|
|
""" |
263
|
1 |
|
return Editex().dist_abs(src, tar, cost, local) |
264
|
|
|
|
265
|
|
|
|
266
|
1 |
|
def dist_editex(src, tar, cost=(0, 1, 2), local=False): |
267
|
|
|
"""Return the normalized Editex distance between two strings. |
268
|
|
|
|
269
|
|
|
This is a wrapper for :py:meth:`Editex.dist`. |
270
|
|
|
|
271
|
|
|
Parameters |
272
|
|
|
---------- |
273
|
|
|
src : str |
274
|
|
|
Source string for comparison |
275
|
|
|
tar : str |
276
|
|
|
Target string for comparison |
277
|
|
|
cost : tuple |
278
|
|
|
A 3-tuple representing the cost of the four possible edits: match, |
279
|
|
|
same-group, and mismatch respectively (by default: (0, 1, 2)) |
280
|
|
|
local : bool |
281
|
|
|
If True, the local variant of Editex is used |
282
|
|
|
|
283
|
|
|
Returns |
284
|
|
|
------- |
285
|
|
|
int |
286
|
|
|
Normalized Editex distance |
287
|
|
|
|
288
|
|
|
Examples |
289
|
|
|
-------- |
290
|
|
|
>>> round(dist_editex('cat', 'hat'), 12) |
291
|
|
|
0.333333333333 |
292
|
|
|
>>> round(dist_editex('Niall', 'Neil'), 12) |
293
|
|
|
0.2 |
294
|
|
|
>>> dist_editex('aluminum', 'Catalan') |
295
|
|
|
0.75 |
296
|
|
|
>>> dist_editex('ATCG', 'TAGC') |
297
|
|
|
0.75 |
298
|
|
|
|
299
|
|
|
""" |
300
|
1 |
|
return Editex().dist(src, tar, cost, local) |
301
|
|
|
|
302
|
|
|
|
303
|
1 |
|
def sim_editex(src, tar, cost=(0, 1, 2), local=False): |
304
|
|
|
"""Return the normalized Editex similarity of two strings. |
305
|
|
|
|
306
|
|
|
This is a wrapper for :py:meth:`Editex.sim`. |
307
|
|
|
|
308
|
|
|
Parameters |
309
|
|
|
---------- |
310
|
|
|
src : str |
311
|
|
|
Source string for comparison |
312
|
|
|
tar : str |
313
|
|
|
Target string for comparison |
314
|
|
|
cost : tuple |
315
|
|
|
A 3-tuple representing the cost of the four possible edits: match, |
316
|
|
|
same-group, and mismatch respectively (by default: (0, 1, 2)) |
317
|
|
|
local : bool |
318
|
|
|
If True, the local variant of Editex is used |
319
|
|
|
|
320
|
|
|
Returns |
321
|
|
|
------- |
322
|
|
|
int |
323
|
|
|
Normalized Editex similarity |
324
|
|
|
|
325
|
|
|
Examples |
326
|
|
|
-------- |
327
|
|
|
>>> round(sim_editex('cat', 'hat'), 12) |
328
|
|
|
0.666666666667 |
329
|
|
|
>>> round(sim_editex('Niall', 'Neil'), 12) |
330
|
|
|
0.8 |
331
|
|
|
>>> sim_editex('aluminum', 'Catalan') |
332
|
|
|
0.25 |
333
|
|
|
>>> sim_editex('ATCG', 'TAGC') |
334
|
|
|
0.25 |
335
|
|
|
|
336
|
|
|
""" |
337
|
1 |
|
return Editex().sim(src, tar, cost, local) |
338
|
|
|
|
339
|
|
|
|
340
|
|
|
if __name__ == '__main__': |
341
|
|
|
import doctest |
342
|
|
|
|
343
|
|
|
doctest.testmod() |
344
|
|
|
|