|
1
|
|
|
# -*- coding: utf-8 -*- |
|
2
|
|
|
|
|
3
|
|
|
# Copyright 2018 by Christopher C. Little. |
|
4
|
|
|
# This file is part of Abydos. |
|
5
|
|
|
# |
|
6
|
|
|
# Abydos is free software: you can redistribute it and/or modify |
|
7
|
|
|
# it under the terms of the GNU General Public License as published by |
|
8
|
|
|
# the Free Software Foundation, either version 3 of the License, or |
|
9
|
|
|
# (at your option) any later version. |
|
10
|
|
|
# |
|
11
|
|
|
# Abydos is distributed in the hope that it will be useful, |
|
12
|
|
|
# but WITHOUT ANY WARRANTY; without even the implied warranty of |
|
13
|
|
|
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
|
14
|
|
|
# GNU General Public License for more details. |
|
15
|
|
|
# |
|
16
|
|
|
# You should have received a copy of the GNU General Public License |
|
17
|
|
|
# along with Abydos. If not, see <http://www.gnu.org/licenses/>. |
|
18
|
|
|
|
|
19
|
1 |
|
"""abydos.distance.typo. |
|
20
|
|
|
|
|
21
|
|
|
The distance.typo module implements typo edit distance functions. |
|
22
|
|
|
""" |
|
23
|
|
|
|
|
24
|
1 |
|
from __future__ import division, unicode_literals |
|
25
|
|
|
|
|
26
|
1 |
|
from math import log |
|
27
|
|
|
|
|
28
|
1 |
|
from numpy import float32 as np_float32 |
|
29
|
1 |
|
from numpy import zeros as np_zeros |
|
30
|
|
|
|
|
31
|
1 |
|
from six.moves import range |
|
32
|
|
|
|
|
33
|
1 |
|
__all__ = ['dist_typo', 'sim_typo', 'typo'] |
|
34
|
|
|
|
|
35
|
|
|
|
|
36
|
1 |
|
def typo(src, tar, metric='euclidean', cost=(1, 1, 0.5, 0.5), layout='QWERTY'): |
|
|
|
|
|
|
37
|
|
|
"""Return the typo distance between two strings. |
|
38
|
|
|
|
|
39
|
|
|
This is inspired by Typo-Distance :cite:`Song:2011`, and a fair bit of |
|
40
|
|
|
this was copied from that module. Compared to the original, this supports |
|
41
|
|
|
different metrics for substitution. |
|
42
|
|
|
|
|
43
|
|
|
:param str src: source string for comparison |
|
44
|
|
|
:param str tar: target string for comparison |
|
45
|
|
|
:param str metric: supported values include: 'euclidean', 'manhattan', |
|
46
|
|
|
'log-euclidean', and 'log-manhattan' |
|
47
|
|
|
:param tuple cost: a 4-tuple representing the cost of the four possible |
|
48
|
|
|
edits: inserts, deletes, substitutions, and shift, respectively (by |
|
49
|
|
|
default: (1, 1, 0.5, 0.5)) The substitution & shift costs should be |
|
50
|
|
|
significantly less than the cost of an insertion & deletion unless |
|
51
|
|
|
a log metric is used. |
|
52
|
|
|
:param str layout: name of the keyboard layout to use (Currently supported: |
|
53
|
|
|
QWERTY, Dvorak, AZERTY, QWERTZ) |
|
54
|
|
|
:returns: typo distance |
|
55
|
|
|
:rtype: float |
|
56
|
|
|
|
|
57
|
|
|
>>> typo('cat', 'hat') |
|
58
|
|
|
1.5811388 |
|
59
|
|
|
>>> typo('Niall', 'Neil') |
|
60
|
|
|
2.8251407 |
|
61
|
|
|
>>> typo('Colin', 'Cuilen') |
|
62
|
|
|
3.4142137 |
|
63
|
|
|
>>> typo('ATCG', 'TAGC') |
|
64
|
|
|
2.5 |
|
65
|
|
|
|
|
66
|
|
|
>>> typo('cat', 'hat', metric='manhattan') |
|
67
|
|
|
2.0 |
|
68
|
|
|
>>> typo('Niall', 'Neil', metric='manhattan') |
|
69
|
|
|
3.0 |
|
70
|
|
|
>>> typo('Colin', 'Cuilen', metric='manhattan') |
|
71
|
|
|
3.5 |
|
72
|
|
|
>>> typo('ATCG', 'TAGC', metric='manhattan') |
|
73
|
|
|
2.5 |
|
74
|
|
|
|
|
75
|
|
|
>>> typo('cat', 'hat', metric='log-manhattan') |
|
76
|
|
|
0.804719 |
|
77
|
|
|
>>> typo('Niall', 'Neil', metric='log-manhattan') |
|
78
|
|
|
2.2424533 |
|
79
|
|
|
>>> typo('Colin', 'Cuilen', metric='log-manhattan') |
|
80
|
|
|
2.2424533 |
|
81
|
|
|
>>> typo('ATCG', 'TAGC', metric='log-manhattan') |
|
82
|
|
|
2.3465736 |
|
83
|
|
|
""" |
|
84
|
1 |
|
ins_cost, del_cost, sub_cost, shift_cost = cost |
|
85
|
|
|
|
|
86
|
1 |
|
if src == tar: |
|
87
|
1 |
|
return 0.0 |
|
88
|
1 |
|
if not src: |
|
89
|
1 |
|
return len(tar) * ins_cost |
|
90
|
1 |
|
if not tar: |
|
91
|
1 |
|
return len(src) * del_cost |
|
92
|
|
|
|
|
93
|
|
|
# fmt: off |
|
94
|
1 |
|
kbs = {'QWERTY': ( |
|
95
|
|
|
(('`', '1', '2', '3', '4', '5', '6', '7', '8', '9', '0', '-', '='), |
|
96
|
|
|
('', 'q', 'w', 'e', 'r', 't', 'y', 'u', 'i', 'o', 'p', '[', ']', |
|
97
|
|
|
'\\'), |
|
98
|
|
|
('', 'a', 's', 'd', 'f', 'g', 'h', 'j', 'k', 'l', ';', '\''), |
|
99
|
|
|
('', 'z', 'x', 'c', 'v', 'b', 'n', 'm', ',', '.', '/')), |
|
100
|
|
|
(('~', '!', '@', '#', '$', '%', '^', '&', '*', '(', ')', '_', '+'), |
|
101
|
|
|
('', 'Q', 'W', 'E', 'R', 'T', 'Y', 'U', 'I', 'O', 'P', '{', '}', '|'), |
|
102
|
|
|
('', 'A', 'S', 'D', 'F', 'G', 'H', 'J', 'K', 'L', ':', '"'), |
|
103
|
|
|
('', 'Z', 'X', 'C', 'V', 'B', 'N', 'M', '<', '>', '?')) |
|
104
|
|
|
), 'Dvorak': ( |
|
105
|
|
|
(('`', '1', '2', '3', '4', '5', '6', '7', '8', '9', '0', '[', ']'), |
|
106
|
|
|
('', '\'', ',', '.', 'p', 'y', 'f', 'g', 'c', 'r', 'l', '/', '=', |
|
107
|
|
|
'\\'), |
|
108
|
|
|
('', 'a', 'o', 'e', 'u', 'i', 'd', 'h', 't', 'n', 's', '-'), |
|
109
|
|
|
('', ';', 'q', 'j', 'k', 'x', 'b', 'm', 'w', 'v', 'z')), |
|
110
|
|
|
(('~', '!', '@', '#', '$', '%', '^', '&', '*', '(', ')', '{', '}'), |
|
111
|
|
|
('', '"', '<', '>', 'P', 'Y', 'F', 'G', 'C', 'R', 'L', '?', '+', '|'), |
|
112
|
|
|
('', 'A', 'O', 'E', 'U', 'I', 'D', 'H', 'T', 'N', 'S', '_'), |
|
113
|
|
|
('', ':', 'Q', 'J', 'K', 'X', 'B', 'M', 'W', 'V', 'Z')) |
|
114
|
|
|
), 'AZERTY': ( |
|
115
|
|
|
(('²', '&', 'é', '"', '\'', '(', '-', 'è', '_', 'ç', 'à', ')', '='), |
|
116
|
|
|
('', 'a', 'z', 'e', 'r', 't', 'y', 'u', 'i', 'o', 'p', '', '$'), |
|
117
|
|
|
('', 'q', 's', 'd', 'f', 'g', 'h', 'j', 'k', 'l', 'm', 'ù', '*'), |
|
118
|
|
|
('<', 'w', 'x', 'c', 'v', 'b', 'n', ',', ';', ':', '!')), |
|
119
|
|
|
(('~', '1', '2', '3', '4', '5', '6', '7', '8', '9', '0', '°', '+'), |
|
120
|
|
|
('', 'A', 'W', 'E', 'R', 'T', 'Y', 'U', 'I', 'O', 'P', '', '£'), |
|
121
|
|
|
('', 'Q', 'S', 'D', 'F', 'G', 'H', 'J', 'K', 'L', 'M', 'Ù', 'μ'), |
|
122
|
|
|
('>', 'W', 'X', 'C', 'V', 'B', 'N', '?', '.', '/', '§')) |
|
123
|
|
|
), 'QWERTZ': ( |
|
124
|
|
|
(('', '1', '2', '3', '4', '5', '6', '7', '8', '9', '0', 'ß', ''), |
|
125
|
|
|
('', 'q', 'w', 'e', 'r', 't', 'z', 'u', 'i', 'o', 'p', ' ü', '+', |
|
126
|
|
|
'\\'), |
|
127
|
|
|
('', 'a', 's', 'd', 'f', 'g', 'h', 'j', 'k', 'l', 'ö', 'ä', '#'), |
|
128
|
|
|
('<', 'y', 'x', 'c', 'v', 'b', 'n', 'm', ',', '.', '-')), |
|
129
|
|
|
(('°', '!', '"', '§', '$', '%', '&', '/', '(', ')', '=', '?', ''), |
|
130
|
|
|
('', 'Q', 'W', 'E', 'R', 'T', 'Z', 'U', 'I', 'O', 'P', 'Ü', '*', ''), |
|
131
|
|
|
('', 'A', 'S', 'D', 'F', 'G', 'H', 'J', 'K', 'L', 'Ö', 'Ä', '\''), |
|
132
|
|
|
('>', 'Y', 'X', 'C', 'V', 'B', 'N', 'M', ';', ':', '_')) |
|
133
|
|
|
)} |
|
134
|
|
|
# fmt: on |
|
135
|
|
|
|
|
136
|
1 |
|
keyboard = kbs[layout] |
|
137
|
1 |
|
lowercase = {item for sublist in keyboard[0] for item in sublist} |
|
138
|
1 |
|
uppercase = {item for sublist in keyboard[1] for item in sublist} |
|
139
|
|
|
|
|
140
|
1 |
|
def _kb_array_for_char(char): |
|
141
|
|
|
"""Return the keyboard layout that contains ch.""" |
|
142
|
1 |
|
if char in lowercase: |
|
143
|
1 |
|
return keyboard[0] |
|
144
|
1 |
|
elif char in uppercase: |
|
145
|
1 |
|
return keyboard[1] |
|
146
|
1 |
|
raise ValueError(char + ' not found in any keyboard layouts') |
|
147
|
|
|
|
|
148
|
1 |
|
def _get_char_coord(char, kb_array): |
|
149
|
|
|
"""Return the row & column of char in the keyboard.""" |
|
150
|
1 |
|
for row in kb_array: # pragma: no branch |
|
151
|
1 |
|
if char in row: |
|
152
|
1 |
|
return kb_array.index(row), row.index(char) |
|
153
|
|
|
|
|
154
|
1 |
|
def _euclidean_keyboard_distance(char1, char2): |
|
155
|
1 |
|
row1, col1 = _get_char_coord(char1, _kb_array_for_char(char1)) |
|
156
|
1 |
|
row2, col2 = _get_char_coord(char2, _kb_array_for_char(char2)) |
|
157
|
1 |
|
return ((row1 - row2) ** 2 + (col1 - col2) ** 2) ** 0.5 |
|
158
|
|
|
|
|
159
|
1 |
|
def _manhattan_keyboard_distance(char1, char2): |
|
160
|
1 |
|
row1, col1 = _get_char_coord(char1, _kb_array_for_char(char1)) |
|
161
|
1 |
|
row2, col2 = _get_char_coord(char2, _kb_array_for_char(char2)) |
|
162
|
1 |
|
return abs(row1 - row2) + abs(col1 - col2) |
|
163
|
|
|
|
|
164
|
1 |
|
def _log_euclidean_keyboard_distance(char1, char2): |
|
165
|
1 |
|
return log(1 + _euclidean_keyboard_distance(char1, char2)) |
|
166
|
|
|
|
|
167
|
1 |
|
def _log_manhattan_keyboard_distance(char1, char2): |
|
168
|
1 |
|
return log(1 + _manhattan_keyboard_distance(char1, char2)) |
|
169
|
|
|
|
|
170
|
1 |
|
metric_dict = { |
|
171
|
|
|
'euclidean': _euclidean_keyboard_distance, |
|
172
|
|
|
'manhattan': _manhattan_keyboard_distance, |
|
173
|
|
|
'log-euclidean': _log_euclidean_keyboard_distance, |
|
174
|
|
|
'log-manhattan': _log_manhattan_keyboard_distance, |
|
175
|
|
|
} |
|
176
|
|
|
|
|
177
|
1 |
|
def _substitution_cost(char1, char2): |
|
178
|
1 |
|
cost = sub_cost |
|
179
|
1 |
|
cost *= metric_dict[metric](char1, char2) + shift_cost * ( |
|
180
|
|
|
_kb_array_for_char(char1) != _kb_array_for_char(char2) |
|
181
|
|
|
) |
|
182
|
1 |
|
return cost |
|
183
|
|
|
|
|
184
|
1 |
|
d_mat = np_zeros((len(src) + 1, len(tar) + 1), dtype=np_float32) |
|
185
|
1 |
|
for i in range(len(src) + 1): |
|
186
|
1 |
|
d_mat[i, 0] = i * del_cost |
|
187
|
1 |
|
for j in range(len(tar) + 1): |
|
188
|
1 |
|
d_mat[0, j] = j * ins_cost |
|
189
|
|
|
|
|
190
|
1 |
|
for i in range(len(src)): |
|
|
|
|
|
|
191
|
1 |
|
for j in range(len(tar)): |
|
|
|
|
|
|
192
|
1 |
|
d_mat[i + 1, j + 1] = min( |
|
193
|
|
|
d_mat[i + 1, j] + ins_cost, # ins |
|
194
|
|
|
d_mat[i, j + 1] + del_cost, # del |
|
195
|
|
|
d_mat[i, j] |
|
196
|
|
|
+ ( |
|
197
|
|
|
_substitution_cost(src[i], tar[j]) |
|
198
|
|
|
if src[i] != tar[j] |
|
199
|
|
|
else 0 |
|
200
|
|
|
), # sub/== |
|
201
|
|
|
) |
|
202
|
|
|
|
|
203
|
1 |
|
return d_mat[len(src), len(tar)] |
|
204
|
|
|
|
|
205
|
|
|
|
|
206
|
1 |
|
def dist_typo(src, tar, metric='euclidean', cost=(1, 1, 0.5, 0.5)): |
|
207
|
|
|
"""Return the normalized typo distance between two strings. |
|
208
|
|
|
|
|
209
|
|
|
This is typo distance, normalized to [0, 1]. |
|
210
|
|
|
|
|
211
|
|
|
:param str src: source string for comparison |
|
212
|
|
|
:param str tar: target string for comparison |
|
213
|
|
|
:param str metric: supported values include: 'euclidean', 'manhattan', |
|
214
|
|
|
'log-euclidean', and 'log-manhattan' |
|
215
|
|
|
:param tuple cost: a 4-tuple representing the cost of the four possible |
|
216
|
|
|
edits: inserts, deletes, substitutions, and shift, respectively (by |
|
217
|
|
|
default: (1, 1, 0.5, 0.5)) The substitution & shift costs should be |
|
218
|
|
|
significantly less than the cost of an insertion & deletion unless |
|
219
|
|
|
a log metric is used. |
|
220
|
|
|
:returns: normalized typo distance |
|
221
|
|
|
:rtype: float |
|
222
|
|
|
|
|
223
|
|
|
>>> round(dist_typo('cat', 'hat'), 12) |
|
224
|
|
|
0.527046283086 |
|
225
|
|
|
>>> round(dist_typo('Niall', 'Neil'), 12) |
|
226
|
|
|
0.565028142929 |
|
227
|
|
|
>>> round(dist_typo('Colin', 'Cuilen'), 12) |
|
228
|
|
|
0.569035609563 |
|
229
|
|
|
>>> dist_typo('ATCG', 'TAGC') |
|
230
|
|
|
0.625 |
|
231
|
|
|
""" |
|
232
|
1 |
|
if src == tar: |
|
233
|
1 |
|
return 0 |
|
234
|
1 |
|
ins_cost, del_cost = cost[:2] |
|
235
|
1 |
|
return typo(src, tar, metric, cost) / ( |
|
236
|
|
|
max(len(src) * del_cost, len(tar) * ins_cost) |
|
237
|
|
|
) |
|
238
|
|
|
|
|
239
|
|
|
|
|
240
|
1 |
|
def sim_typo(src, tar, metric='euclidean', cost=(1, 1, 0.5, 0.5)): |
|
241
|
|
|
"""Return the normalized typo similarity between two strings. |
|
242
|
|
|
|
|
243
|
|
|
Normalized typo similarity is the complement of normalized typo distance: |
|
244
|
|
|
:math:`sim_{typo} = 1 - dist_{typo}`. |
|
245
|
|
|
|
|
246
|
|
|
:param str src: source string for comparison |
|
247
|
|
|
:param str tar: target string for comparison |
|
248
|
|
|
:param str metric: supported values include: 'euclidean', 'manhattan', |
|
249
|
|
|
'log-euclidean', and 'log-manhattan' |
|
250
|
|
|
:param tuple cost: a 4-tuple representing the cost of the four possible |
|
251
|
|
|
edits: inserts, deletes, substitutions, and shift, respectively (by |
|
252
|
|
|
default: (1, 1, 0.5, 0.5)) The substitution & shift costs should be |
|
253
|
|
|
significantly less than the cost of an insertion & deletion unless |
|
254
|
|
|
a log metric is used. |
|
255
|
|
|
:returns: normalized typo similarity |
|
256
|
|
|
:rtype: float |
|
257
|
|
|
|
|
258
|
|
|
>>> round(sim_typo('cat', 'hat'), 12) |
|
259
|
|
|
0.472953716914 |
|
260
|
|
|
>>> round(sim_typo('Niall', 'Neil'), 12) |
|
261
|
|
|
0.434971857071 |
|
262
|
|
|
>>> round(sim_typo('Colin', 'Cuilen'), 12) |
|
263
|
|
|
0.430964390437 |
|
264
|
|
|
>>> sim_typo('ATCG', 'TAGC') |
|
265
|
|
|
0.375 |
|
266
|
|
|
""" |
|
267
|
1 |
|
return 1 - dist_typo(src, tar, metric, cost) |
|
268
|
|
|
|
|
269
|
|
|
|
|
270
|
|
|
if __name__ == '__main__': |
|
271
|
|
|
import doctest |
|
272
|
|
|
|
|
273
|
|
|
doctest.testmod() |
|
274
|
|
|
|