1
|
|
|
# -*- coding: utf-8 -*- |
2
|
|
|
|
3
|
|
|
# Copyright 2018 by Christopher C. Little. |
4
|
|
|
# This file is part of Abydos. |
5
|
|
|
# |
6
|
|
|
# Abydos is free software: you can redistribute it and/or modify |
7
|
|
|
# it under the terms of the GNU General Public License as published by |
8
|
|
|
# the Free Software Foundation, either version 3 of the License, or |
9
|
|
|
# (at your option) any later version. |
10
|
|
|
# |
11
|
|
|
# Abydos is distributed in the hope that it will be useful, |
12
|
|
|
# but WITHOUT ANY WARRANTY; without even the implied warranty of |
13
|
|
|
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
14
|
|
|
# GNU General Public License for more details. |
15
|
|
|
# |
16
|
|
|
# You should have received a copy of the GNU General Public License |
17
|
|
|
# along with Abydos. If not, see <http://www.gnu.org/licenses/>. |
18
|
|
|
|
19
|
1 |
|
"""abydos.distance.typo. |
20
|
|
|
|
21
|
|
|
The distance.typo module implements typo edit distance functions. |
22
|
|
|
""" |
23
|
|
|
|
24
|
1 |
|
from __future__ import division, unicode_literals |
25
|
|
|
|
26
|
1 |
|
from math import log |
27
|
|
|
|
28
|
1 |
|
from numpy import float32 as np_float32 |
29
|
1 |
|
from numpy import zeros as np_zeros |
30
|
|
|
|
31
|
1 |
|
from six.moves import range |
32
|
|
|
|
33
|
1 |
|
__all__ = ['dist_typo', 'sim_typo', 'typo'] |
34
|
|
|
|
35
|
|
|
|
36
|
1 |
|
def typo(src, tar, metric='euclidean', cost=(1, 1, 0.5, 0.5), layout='QWERTY'): |
|
|
|
|
37
|
|
|
"""Return the typo distance between two strings. |
38
|
|
|
|
39
|
|
|
This is inspired by Typo-Distance :cite:`Song:2011`, and a fair bit of |
40
|
|
|
this was copied from that module. Compared to the original, this supports |
41
|
|
|
different metrics for substitution. |
42
|
|
|
|
43
|
|
|
:param str src: source string for comparison |
44
|
|
|
:param str tar: target string for comparison |
45
|
|
|
:param str metric: supported values include: 'euclidean', 'manhattan', |
46
|
|
|
'log-euclidean', and 'log-manhattan' |
47
|
|
|
:param tuple cost: a 4-tuple representing the cost of the four possible |
48
|
|
|
edits: inserts, deletes, substitutions, and shift, respectively (by |
49
|
|
|
default: (1, 1, 0.5, 0.5)) The substitution & shift costs should be |
50
|
|
|
significantly less than the cost of an insertion & deletion unless |
51
|
|
|
a log metric is used. |
52
|
|
|
:param str layout: name of the keyboard layout to use (Currently supported: |
53
|
|
|
QWERTY, Dvorak, AZERTY, QWERTZ) |
54
|
|
|
:returns: typo distance |
55
|
|
|
:rtype: float |
56
|
|
|
|
57
|
|
|
>>> typo('cat', 'hat') |
58
|
|
|
1.5811388 |
59
|
|
|
>>> typo('Niall', 'Neil') |
60
|
|
|
2.8251407 |
61
|
|
|
>>> typo('Colin', 'Cuilen') |
62
|
|
|
3.4142137 |
63
|
|
|
>>> typo('ATCG', 'TAGC') |
64
|
|
|
2.5 |
65
|
|
|
|
66
|
|
|
>>> typo('cat', 'hat', metric='manhattan') |
67
|
|
|
2.0 |
68
|
|
|
>>> typo('Niall', 'Neil', metric='manhattan') |
69
|
|
|
3.0 |
70
|
|
|
>>> typo('Colin', 'Cuilen', metric='manhattan') |
71
|
|
|
3.5 |
72
|
|
|
>>> typo('ATCG', 'TAGC', metric='manhattan') |
73
|
|
|
2.5 |
74
|
|
|
|
75
|
|
|
>>> typo('cat', 'hat', metric='log-manhattan') |
76
|
|
|
0.804719 |
77
|
|
|
>>> typo('Niall', 'Neil', metric='log-manhattan') |
78
|
|
|
2.2424533 |
79
|
|
|
>>> typo('Colin', 'Cuilen', metric='log-manhattan') |
80
|
|
|
2.2424533 |
81
|
|
|
>>> typo('ATCG', 'TAGC', metric='log-manhattan') |
82
|
|
|
2.3465736 |
83
|
|
|
""" |
84
|
1 |
|
ins_cost, del_cost, sub_cost, shift_cost = cost |
85
|
|
|
|
86
|
1 |
|
if src == tar: |
87
|
1 |
|
return 0.0 |
88
|
1 |
|
if not src: |
89
|
1 |
|
return len(tar) * ins_cost |
90
|
1 |
|
if not tar: |
91
|
1 |
|
return len(src) * del_cost |
92
|
|
|
|
93
|
|
|
# fmt: off |
94
|
1 |
|
kbs = {'QWERTY': ( |
95
|
|
|
(('`', '1', '2', '3', '4', '5', '6', '7', '8', '9', '0', '-', '='), |
96
|
|
|
('', 'q', 'w', 'e', 'r', 't', 'y', 'u', 'i', 'o', 'p', '[', ']', |
97
|
|
|
'\\'), |
98
|
|
|
('', 'a', 's', 'd', 'f', 'g', 'h', 'j', 'k', 'l', ';', '\''), |
99
|
|
|
('', 'z', 'x', 'c', 'v', 'b', 'n', 'm', ',', '.', '/')), |
100
|
|
|
(('~', '!', '@', '#', '$', '%', '^', '&', '*', '(', ')', '_', '+'), |
101
|
|
|
('', 'Q', 'W', 'E', 'R', 'T', 'Y', 'U', 'I', 'O', 'P', '{', '}', '|'), |
102
|
|
|
('', 'A', 'S', 'D', 'F', 'G', 'H', 'J', 'K', 'L', ':', '"'), |
103
|
|
|
('', 'Z', 'X', 'C', 'V', 'B', 'N', 'M', '<', '>', '?')) |
104
|
|
|
), 'Dvorak': ( |
105
|
|
|
(('`', '1', '2', '3', '4', '5', '6', '7', '8', '9', '0', '[', ']'), |
106
|
|
|
('', '\'', ',', '.', 'p', 'y', 'f', 'g', 'c', 'r', 'l', '/', '=', |
107
|
|
|
'\\'), |
108
|
|
|
('', 'a', 'o', 'e', 'u', 'i', 'd', 'h', 't', 'n', 's', '-'), |
109
|
|
|
('', ';', 'q', 'j', 'k', 'x', 'b', 'm', 'w', 'v', 'z')), |
110
|
|
|
(('~', '!', '@', '#', '$', '%', '^', '&', '*', '(', ')', '{', '}'), |
111
|
|
|
('', '"', '<', '>', 'P', 'Y', 'F', 'G', 'C', 'R', 'L', '?', '+', '|'), |
112
|
|
|
('', 'A', 'O', 'E', 'U', 'I', 'D', 'H', 'T', 'N', 'S', '_'), |
113
|
|
|
('', ':', 'Q', 'J', 'K', 'X', 'B', 'M', 'W', 'V', 'Z')) |
114
|
|
|
), 'AZERTY': ( |
115
|
|
|
(('²', '&', 'é', '"', '\'', '(', '-', 'è', '_', 'ç', 'à', ')', '='), |
116
|
|
|
('', 'a', 'z', 'e', 'r', 't', 'y', 'u', 'i', 'o', 'p', '', '$'), |
117
|
|
|
('', 'q', 's', 'd', 'f', 'g', 'h', 'j', 'k', 'l', 'm', 'ù', '*'), |
118
|
|
|
('<', 'w', 'x', 'c', 'v', 'b', 'n', ',', ';', ':', '!')), |
119
|
|
|
(('~', '1', '2', '3', '4', '5', '6', '7', '8', '9', '0', '°', '+'), |
120
|
|
|
('', 'A', 'W', 'E', 'R', 'T', 'Y', 'U', 'I', 'O', 'P', '', '£'), |
121
|
|
|
('', 'Q', 'S', 'D', 'F', 'G', 'H', 'J', 'K', 'L', 'M', 'Ù', 'μ'), |
122
|
|
|
('>', 'W', 'X', 'C', 'V', 'B', 'N', '?', '.', '/', '§')) |
123
|
|
|
), 'QWERTZ': ( |
124
|
|
|
(('', '1', '2', '3', '4', '5', '6', '7', '8', '9', '0', 'ß', ''), |
125
|
|
|
('', 'q', 'w', 'e', 'r', 't', 'z', 'u', 'i', 'o', 'p', ' ü', '+', |
126
|
|
|
'\\'), |
127
|
|
|
('', 'a', 's', 'd', 'f', 'g', 'h', 'j', 'k', 'l', 'ö', 'ä', '#'), |
128
|
|
|
('<', 'y', 'x', 'c', 'v', 'b', 'n', 'm', ',', '.', '-')), |
129
|
|
|
(('°', '!', '"', '§', '$', '%', '&', '/', '(', ')', '=', '?', ''), |
130
|
|
|
('', 'Q', 'W', 'E', 'R', 'T', 'Z', 'U', 'I', 'O', 'P', 'Ü', '*', ''), |
131
|
|
|
('', 'A', 'S', 'D', 'F', 'G', 'H', 'J', 'K', 'L', 'Ö', 'Ä', '\''), |
132
|
|
|
('>', 'Y', 'X', 'C', 'V', 'B', 'N', 'M', ';', ':', '_')) |
133
|
|
|
)} |
134
|
|
|
# fmt: on |
135
|
|
|
|
136
|
1 |
|
keyboard = kbs[layout] |
137
|
1 |
|
lowercase = {item for sublist in keyboard[0] for item in sublist} |
138
|
1 |
|
uppercase = {item for sublist in keyboard[1] for item in sublist} |
139
|
|
|
|
140
|
1 |
|
def _kb_array_for_char(char): |
141
|
|
|
"""Return the keyboard layout that contains ch.""" |
142
|
1 |
|
if char in lowercase: |
143
|
1 |
|
return keyboard[0] |
144
|
1 |
|
elif char in uppercase: |
145
|
1 |
|
return keyboard[1] |
146
|
1 |
|
raise ValueError(char + ' not found in any keyboard layouts') |
147
|
|
|
|
148
|
1 |
|
def _get_char_coord(char, kb_array): |
149
|
|
|
"""Return the row & column of char in the keyboard.""" |
150
|
1 |
|
for row in kb_array: # pragma: no branch |
151
|
1 |
|
if char in row: |
152
|
1 |
|
return kb_array.index(row), row.index(char) |
153
|
|
|
|
154
|
1 |
|
def _euclidean_keyboard_distance(char1, char2): |
155
|
1 |
|
row1, col1 = _get_char_coord(char1, _kb_array_for_char(char1)) |
156
|
1 |
|
row2, col2 = _get_char_coord(char2, _kb_array_for_char(char2)) |
157
|
1 |
|
return ((row1 - row2) ** 2 + (col1 - col2) ** 2) ** 0.5 |
158
|
|
|
|
159
|
1 |
|
def _manhattan_keyboard_distance(char1, char2): |
160
|
1 |
|
row1, col1 = _get_char_coord(char1, _kb_array_for_char(char1)) |
161
|
1 |
|
row2, col2 = _get_char_coord(char2, _kb_array_for_char(char2)) |
162
|
1 |
|
return abs(row1 - row2) + abs(col1 - col2) |
163
|
|
|
|
164
|
1 |
|
def _log_euclidean_keyboard_distance(char1, char2): |
165
|
1 |
|
return log(1 + _euclidean_keyboard_distance(char1, char2)) |
166
|
|
|
|
167
|
1 |
|
def _log_manhattan_keyboard_distance(char1, char2): |
168
|
1 |
|
return log(1 + _manhattan_keyboard_distance(char1, char2)) |
169
|
|
|
|
170
|
1 |
|
metric_dict = { |
171
|
|
|
'euclidean': _euclidean_keyboard_distance, |
172
|
|
|
'manhattan': _manhattan_keyboard_distance, |
173
|
|
|
'log-euclidean': _log_euclidean_keyboard_distance, |
174
|
|
|
'log-manhattan': _log_manhattan_keyboard_distance, |
175
|
|
|
} |
176
|
|
|
|
177
|
1 |
|
def _substitution_cost(char1, char2): |
178
|
1 |
|
cost = sub_cost |
179
|
1 |
|
cost *= metric_dict[metric](char1, char2) + shift_cost * ( |
180
|
|
|
_kb_array_for_char(char1) != _kb_array_for_char(char2) |
181
|
|
|
) |
182
|
1 |
|
return cost |
183
|
|
|
|
184
|
1 |
|
d_mat = np_zeros((len(src) + 1, len(tar) + 1), dtype=np_float32) |
185
|
1 |
|
for i in range(len(src) + 1): |
186
|
1 |
|
d_mat[i, 0] = i * del_cost |
187
|
1 |
|
for j in range(len(tar) + 1): |
188
|
1 |
|
d_mat[0, j] = j * ins_cost |
189
|
|
|
|
190
|
1 |
|
for i in range(len(src)): |
|
|
|
|
191
|
1 |
|
for j in range(len(tar)): |
|
|
|
|
192
|
1 |
|
d_mat[i + 1, j + 1] = min( |
193
|
|
|
d_mat[i + 1, j] + ins_cost, # ins |
194
|
|
|
d_mat[i, j + 1] + del_cost, # del |
195
|
|
|
d_mat[i, j] |
196
|
|
|
+ ( |
197
|
|
|
_substitution_cost(src[i], tar[j]) |
198
|
|
|
if src[i] != tar[j] |
199
|
|
|
else 0 |
200
|
|
|
), # sub/== |
201
|
|
|
) |
202
|
|
|
|
203
|
1 |
|
return d_mat[len(src), len(tar)] |
204
|
|
|
|
205
|
|
|
|
206
|
1 |
|
def dist_typo(src, tar, metric='euclidean', cost=(1, 1, 0.5, 0.5)): |
207
|
|
|
"""Return the normalized typo distance between two strings. |
208
|
|
|
|
209
|
|
|
This is typo distance, normalized to [0, 1]. |
210
|
|
|
|
211
|
|
|
:param str src: source string for comparison |
212
|
|
|
:param str tar: target string for comparison |
213
|
|
|
:param str metric: supported values include: 'euclidean', 'manhattan', |
214
|
|
|
'log-euclidean', and 'log-manhattan' |
215
|
|
|
:param tuple cost: a 4-tuple representing the cost of the four possible |
216
|
|
|
edits: inserts, deletes, substitutions, and shift, respectively (by |
217
|
|
|
default: (1, 1, 0.5, 0.5)) The substitution & shift costs should be |
218
|
|
|
significantly less than the cost of an insertion & deletion unless |
219
|
|
|
a log metric is used. |
220
|
|
|
:returns: normalized typo distance |
221
|
|
|
:rtype: float |
222
|
|
|
|
223
|
|
|
>>> round(dist_typo('cat', 'hat'), 12) |
224
|
|
|
0.527046283086 |
225
|
|
|
>>> round(dist_typo('Niall', 'Neil'), 12) |
226
|
|
|
0.565028142929 |
227
|
|
|
>>> round(dist_typo('Colin', 'Cuilen'), 12) |
228
|
|
|
0.569035609563 |
229
|
|
|
>>> dist_typo('ATCG', 'TAGC') |
230
|
|
|
0.625 |
231
|
|
|
""" |
232
|
1 |
|
if src == tar: |
233
|
1 |
|
return 0 |
234
|
1 |
|
ins_cost, del_cost = cost[:2] |
235
|
1 |
|
return typo(src, tar, metric, cost) / ( |
236
|
|
|
max(len(src) * del_cost, len(tar) * ins_cost) |
237
|
|
|
) |
238
|
|
|
|
239
|
|
|
|
240
|
1 |
|
def sim_typo(src, tar, metric='euclidean', cost=(1, 1, 0.5, 0.5)): |
241
|
|
|
"""Return the normalized typo similarity between two strings. |
242
|
|
|
|
243
|
|
|
Normalized typo similarity is the complement of normalized typo distance: |
244
|
|
|
:math:`sim_{typo} = 1 - dist_{typo}`. |
245
|
|
|
|
246
|
|
|
:param str src: source string for comparison |
247
|
|
|
:param str tar: target string for comparison |
248
|
|
|
:param str metric: supported values include: 'euclidean', 'manhattan', |
249
|
|
|
'log-euclidean', and 'log-manhattan' |
250
|
|
|
:param tuple cost: a 4-tuple representing the cost of the four possible |
251
|
|
|
edits: inserts, deletes, substitutions, and shift, respectively (by |
252
|
|
|
default: (1, 1, 0.5, 0.5)) The substitution & shift costs should be |
253
|
|
|
significantly less than the cost of an insertion & deletion unless |
254
|
|
|
a log metric is used. |
255
|
|
|
:returns: normalized typo similarity |
256
|
|
|
:rtype: float |
257
|
|
|
|
258
|
|
|
>>> round(sim_typo('cat', 'hat'), 12) |
259
|
|
|
0.472953716914 |
260
|
|
|
>>> round(sim_typo('Niall', 'Neil'), 12) |
261
|
|
|
0.434971857071 |
262
|
|
|
>>> round(sim_typo('Colin', 'Cuilen'), 12) |
263
|
|
|
0.430964390437 |
264
|
|
|
>>> sim_typo('ATCG', 'TAGC') |
265
|
|
|
0.375 |
266
|
|
|
""" |
267
|
1 |
|
return 1 - dist_typo(src, tar, metric, cost) |
268
|
|
|
|
269
|
|
|
|
270
|
|
|
if __name__ == '__main__': |
271
|
|
|
import doctest |
272
|
|
|
|
273
|
|
|
doctest.testmod() |
274
|
|
|
|