1
|
|
|
# Copyright 2018-2020 by Christopher C. Little. |
2
|
|
|
# This file is part of Abydos. |
3
|
|
|
# |
4
|
|
|
# Abydos is free software: you can redistribute it and/or modify |
5
|
|
|
# it under the terms of the GNU General Public License as published by |
6
|
|
|
# the Free Software Foundation, either version 3 of the License, or |
7
|
|
|
# (at your option) any later version. |
8
|
|
|
# |
9
|
|
|
# Abydos is distributed in the hope that it will be useful, |
10
|
|
|
# but WITHOUT ANY WARRANTY; without even the implied warranty of |
11
|
|
|
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
12
|
|
|
# GNU General Public License for more details. |
13
|
|
|
# |
14
|
|
|
# You should have received a copy of the GNU General Public License |
15
|
|
|
# along with Abydos. If not, see <http://www.gnu.org/licenses/>. |
16
|
|
|
|
17
|
|
|
"""abydos.distance._typo. |
18
|
|
|
|
19
|
1 |
|
Typo edit distance functions. |
20
|
|
|
""" |
21
|
|
|
|
22
|
|
|
from itertools import chain |
23
|
|
|
from math import log |
24
|
1 |
|
from typing import Any, Dict, Tuple, cast |
25
|
|
|
|
26
|
|
|
from numpy import float_ as np_float |
27
|
|
|
from numpy import zeros as np_zeros |
28
|
|
|
|
29
|
|
|
from ._distance import _Distance |
30
|
|
|
|
31
|
1 |
|
|
32
|
1 |
|
__all__ = ['Typo'] |
33
|
|
|
|
34
|
1 |
|
|
35
|
|
|
class Typo(_Distance): |
36
|
1 |
|
"""Typo distance. |
37
|
1 |
|
|
38
|
|
|
This is inspired by Typo-Distance :cite:`Song:2011`, and a fair bit of |
39
|
1 |
|
this was copied from that module. Compared to the original, this supports |
40
|
|
|
different metrics for substitution. |
41
|
1 |
|
|
42
|
1 |
|
.. versionadded:: 0.3.6 |
43
|
|
|
""" |
44
|
|
|
|
45
|
1 |
|
# fmt: off |
46
|
|
|
_keyboard = {'QWERTY': ( |
47
|
|
|
(('`', '1', '2', '3', '4', '5', '6', '7', '8', '9', '0', '-', '='), |
48
|
1 |
|
('', 'q', 'w', 'e', 'r', 't', 'y', 'u', 'i', 'o', 'p', '[', ']', |
49
|
|
|
'\\'), |
50
|
|
|
('', 'a', 's', 'd', 'f', 'g', 'h', 'j', 'k', 'l', ';', "'"), |
51
|
|
|
('', 'z', 'x', 'c', 'v', 'b', 'n', 'm', ',', '.', '/'), |
52
|
|
|
('', '', '', ' ')), |
53
|
|
|
(('~', '!', '@', '#', '$', '%', '^', '&', '*', '(', ')', '_', '+'), |
54
|
|
|
('', 'Q', 'W', 'E', 'R', 'T', 'Y', 'U', 'I', 'O', 'P', '{', '}', '|'), |
55
|
|
|
('', 'A', 'S', 'D', 'F', 'G', 'H', 'J', 'K', 'L', ':', '"'), |
56
|
|
|
('', 'Z', 'X', 'C', 'V', 'B', 'N', 'M', '<', '>', '?'), |
57
|
|
|
('', '', '', ' ')) |
58
|
|
|
), 'Dvorak': ( |
59
|
1 |
|
(('`', '1', '2', '3', '4', '5', '6', '7', '8', '9', '0', '[', ']'), |
60
|
|
|
('', "'", ',', '.', 'p', 'y', 'f', 'g', 'c', 'r', 'l', '/', '=', |
61
|
|
|
'\\'), |
62
|
|
|
('', 'a', 'o', 'e', 'u', 'i', 'd', 'h', 't', 'n', 's', '-'), |
63
|
|
|
('', ';', 'q', 'j', 'k', 'x', 'b', 'm', 'w', 'v', 'z'), |
64
|
|
|
('', '', '', ' ')), |
65
|
|
|
(('~', '!', '@', '#', '$', '%', '^', '&', '*', '(', ')', '{', '}'), |
66
|
|
|
('', '"', '<', '>', 'P', 'Y', 'F', 'G', 'C', 'R', 'L', '?', '+', '|'), |
67
|
|
|
('', 'A', 'O', 'E', 'U', 'I', 'D', 'H', 'T', 'N', 'S', '_'), |
68
|
|
|
('', ':', 'Q', 'J', 'K', 'X', 'B', 'M', 'W', 'V', 'Z'), |
69
|
|
|
('', '', '', ' ')) |
70
|
|
|
), 'AZERTY': ( |
71
|
|
|
(('²', '&', 'é', '"', "'", '(', '-', 'è', '_', 'ç', 'à', ')', '='), |
72
|
|
|
('', 'a', 'z', 'e', 'r', 't', 'y', 'u', 'i', 'o', 'p', '', '$'), |
73
|
|
|
('', 'q', 's', 'd', 'f', 'g', 'h', 'j', 'k', 'l', 'm', 'ù', '*'), |
74
|
|
|
('<', 'w', 'x', 'c', 'v', 'b', 'n', ',', ';', ':', '!'), |
75
|
|
|
('', '', '', ' ')), |
76
|
|
|
(('~', '1', '2', '3', '4', '5', '6', '7', '8', '9', '0', '°', '+'), |
77
|
|
|
('', 'A', 'W', 'E', 'R', 'T', 'Y', 'U', 'I', 'O', 'P', '', '£'), |
78
|
|
|
('', 'Q', 'S', 'D', 'F', 'G', 'H', 'J', 'K', 'L', 'M', 'Ù', 'μ'), |
79
|
|
|
('>', 'W', 'X', 'C', 'V', 'B', 'N', '?', '.', '/', '§'), |
80
|
|
|
('', '', '', ' ')) |
81
|
|
|
), 'QWERTZ': ( |
82
|
|
|
(('', '1', '2', '3', '4', '5', '6', '7', '8', '9', '0', 'ß', ''), |
83
|
|
|
('', 'q', 'w', 'e', 'r', 't', 'z', 'u', 'i', 'o', 'p', ' ü', '+', |
84
|
|
|
'\\'), |
85
|
|
|
('', 'a', 's', 'd', 'f', 'g', 'h', 'j', 'k', 'l', 'ö', 'ä', '#'), |
86
|
|
|
('<', 'y', 'x', 'c', 'v', 'b', 'n', 'm', ',', '.', '-'), |
87
|
|
|
('', '', '', ' ')), |
88
|
|
|
(('°', '!', '"', '§', '$', '%', '&', '/', '(', ')', '=', '?', ''), |
89
|
|
|
('', 'Q', 'W', 'E', 'R', 'T', 'Z', 'U', 'I', 'O', 'P', 'Ü', '*', ''), |
90
|
|
|
('', 'A', 'S', 'D', 'F', 'G', 'H', 'J', 'K', 'L', 'Ö', 'Ä', "'"), |
91
|
|
|
('>', 'Y', 'X', 'C', 'V', 'B', 'N', 'M', ';', ':', '_'), |
92
|
|
|
('', '', '', ' ')) |
93
|
|
|
), 'QWERTZ_HUN': ( |
94
|
|
|
(('0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'ö', 'ü', 'ó'), |
95
|
|
|
('', 'q', 'w', 'e', 'r', 't', 'z', 'u', 'i', 'o', 'p', ' ő', 'ú', |
96
|
|
|
'\\'), |
97
|
|
|
('', 'a', 's', 'd', 'f', 'g', 'h', 'j', 'k', 'l', 'é', 'á', 'ű'), |
98
|
|
|
('í', 'y', 'x', 'c', 'v', 'b', 'n', 'm', ',', '.', '-'), |
99
|
|
|
('', '', '', ' ')), |
100
|
|
|
(('§', "'", '"', '+', '!', '%', '/', '=', '(', ')', 'Ö', 'Ü', 'Ó'), |
101
|
|
|
('', 'Q', 'W', 'E', 'R', 'T', 'Z', 'U', 'I', 'O', 'P', 'Ő', 'Ú', ''), |
102
|
|
|
('', 'A', 'S', 'D', 'F', 'G', 'H', 'J', 'K', 'L', 'É', 'Á', 'Ű'), |
103
|
|
|
('Í', 'Y', 'X', 'C', 'V', 'B', 'N', 'M', '?', ':', '_'), |
104
|
|
|
('', '', '', ' ')), |
105
|
|
|
(('', '~', 'ˇ', '^', '˘', '°', '˛', '`', '˙', '´', '˝', '¨', '¸'), |
106
|
|
|
('', '\\', '|', '', '', '', '', '€', '', '', '', '÷', '×', ''), |
107
|
|
|
('', '', 'đ', 'Đ', '[', ']', '', '', 'ł', 'Ł', '$', 'ß', '¤'), |
108
|
|
|
('<', '>', '#', '&', '@', '{', '}', '', ';', '>', '*'), |
109
|
1 |
|
('', '', '', ' ')) |
110
|
|
|
)} # type: Dict[str, Tuple[Tuple[Tuple[str, ...], ...], ...]] |
111
|
|
|
# fmt: on |
112
|
|
|
|
113
|
|
|
def __init__( |
114
|
|
|
self, |
115
|
|
|
metric: str = 'euclidean', |
116
|
|
|
cost: Tuple[float, float, float, float] = (1.0, 1.0, 0.5, 0.5), |
117
|
|
|
layout: str = 'QWERTY', |
118
|
|
|
failsafe: bool = False, |
119
|
|
|
**kwargs: Any |
120
|
|
|
): |
121
|
|
|
"""Initialize Typo instance. |
122
|
|
|
|
123
|
|
|
Parameters |
124
|
|
|
---------- |
125
|
|
|
metric : str |
126
|
|
|
Supported values include: ``euclidean``, ``manhattan``, |
127
|
|
|
``log-euclidean``, and ``log-manhattan`` |
128
|
|
|
cost : tuple |
129
|
|
|
A 4-tuple representing the cost of the four possible edits: |
130
|
|
|
inserts, deletes, substitutions, and shift, respectively (by |
131
|
|
|
default: (1, 1, 0.5, 0.5)) The substitution & shift costs should be |
132
|
|
|
significantly less than the cost of an insertion & deletion unless |
133
|
|
|
a log metric is used. |
134
|
|
|
layout : str |
135
|
|
|
Name of the keyboard layout to use (Currently supported: |
136
|
|
|
``QWERTY``, ``Dvorak``, ``AZERTY``, ``QWERTZ``, ``auto``). If |
137
|
|
|
``auto`` is selected, the class will attempt to determine an |
138
|
|
|
appropriate keyboard based on the supplied words. |
139
|
|
|
failsafe : bool |
140
|
|
|
If True, substitution of an unknown character (one not present on |
141
|
|
|
the selected keyboard) will incur a cost equal to an insertion plus |
142
|
|
|
a deletion. |
143
|
|
|
**kwargs |
144
|
|
|
Arbitrary keyword arguments |
145
|
|
|
|
146
|
1 |
|
|
147
|
1 |
|
.. versionadded:: 0.4.0 |
148
|
1 |
|
|
149
|
1 |
|
""" |
150
|
1 |
|
super(Typo, self).__init__(**kwargs) |
151
|
|
|
self._metric = metric |
152
|
1 |
|
self._cost = cost |
153
|
|
|
self._layout = layout |
154
|
|
|
self._failsafe = failsafe |
155
|
|
|
|
156
|
|
|
def dist_abs(self, src: str, tar: str) -> float: |
157
|
|
|
"""Return the typo distance between two strings. |
158
|
|
|
|
159
|
|
|
Parameters |
160
|
|
|
---------- |
161
|
|
|
src : str |
162
|
|
|
Source string for comparison |
163
|
|
|
tar : str |
164
|
|
|
Target string for comparison |
165
|
|
|
|
166
|
|
|
Returns |
167
|
|
|
------- |
168
|
|
|
float |
169
|
|
|
Typo distance |
170
|
|
|
|
171
|
|
|
Raises |
172
|
|
|
------ |
173
|
|
|
ValueError |
174
|
|
|
char not found in any keyboard layouts |
175
|
|
|
|
176
|
|
|
Examples |
177
|
|
|
-------- |
178
|
|
|
>>> cmp = Typo() |
179
|
|
|
>>> cmp.dist_abs('cat', 'hat') |
180
|
|
|
1.5811388300841898 |
181
|
|
|
>>> cmp.dist_abs('Niall', 'Neil') |
182
|
|
|
2.8251407699364424 |
183
|
|
|
>>> cmp.dist_abs('Colin', 'Cuilen') |
184
|
|
|
3.414213562373095 |
185
|
|
|
>>> cmp.dist_abs('ATCG', 'TAGC') |
186
|
|
|
2.5 |
187
|
|
|
|
188
|
|
|
>>> cmp = Typo(metric='manhattan') |
189
|
|
|
>>> cmp.dist_abs('cat', 'hat') |
190
|
|
|
2.0 |
191
|
|
|
>>> cmp.dist_abs('Niall', 'Neil') |
192
|
|
|
3.0 |
193
|
|
|
>>> cmp.dist_abs('Colin', 'Cuilen') |
194
|
|
|
3.5 |
195
|
|
|
>>> cmp.dist_abs('ATCG', 'TAGC') |
196
|
|
|
2.5 |
197
|
|
|
|
198
|
|
|
>>> cmp = Typo(metric='log-manhattan') |
199
|
|
|
>>> cmp.dist_abs('cat', 'hat') |
200
|
|
|
0.8047189562170501 |
201
|
|
|
>>> cmp.dist_abs('Niall', 'Neil') |
202
|
|
|
2.2424533248940004 |
203
|
|
|
>>> cmp.dist_abs('Colin', 'Cuilen') |
204
|
|
|
2.242453324894 |
205
|
|
|
>>> cmp.dist_abs('ATCG', 'TAGC') |
206
|
|
|
2.3465735902799727 |
207
|
|
|
|
208
|
|
|
|
209
|
|
|
.. versionadded:: 0.3.0 |
210
|
1 |
|
.. versionchanged:: 0.3.6 |
211
|
|
|
Encapsulated in class |
212
|
1 |
|
|
213
|
1 |
|
""" |
214
|
1 |
|
ins_cost, del_cost, sub_cost, shift_cost = self._cost |
215
|
1 |
|
|
216
|
1 |
|
if src == tar: |
217
|
1 |
|
return 0.0 |
218
|
|
|
if not src: |
219
|
1 |
|
return len(tar) * ins_cost |
220
|
1 |
|
if not tar: |
221
|
1 |
|
return len(src) * del_cost |
222
|
1 |
|
|
223
|
1 |
|
if self._layout == 'auto': |
224
|
1 |
|
for kb in ['QWERTY', 'QWERTZ', 'AZERTY']: |
225
|
1 |
|
keys = set(chain(*chain(*self._keyboard[kb]))) |
226
|
|
|
letters = set(src) | set(tar) |
227
|
|
|
if not (letters - keys): |
228
|
1 |
|
keyboard = self._keyboard[kb] |
229
|
|
|
break |
230
|
1 |
|
else: |
231
|
|
|
# Fallback to QWERTY |
232
|
1 |
|
keyboard = self._keyboard['QWERTY'] |
233
|
1 |
|
else: |
234
|
1 |
|
keyboard = self._keyboard[self._layout] |
235
|
|
|
|
236
|
1 |
|
kb_array = [] |
237
|
|
|
for kb_mode in keyboard: |
238
|
|
|
kb_array.append({item for sublist in kb_mode for item in sublist}) |
239
|
|
|
keys = set(chain(*chain(*keyboard))) |
240
|
|
|
|
241
|
|
|
def _kb_array_for_char(char: str) -> Tuple[Tuple[str, ...], ...]: |
242
|
|
|
"""Return the keyboard layout that contains ch. |
243
|
|
|
|
244
|
|
|
Parameters |
245
|
|
|
---------- |
246
|
|
|
char : str |
247
|
|
|
The character to lookup |
248
|
|
|
|
249
|
|
|
Returns |
250
|
|
|
------- |
251
|
|
|
tuple |
252
|
|
|
A keyboard |
253
|
|
|
|
254
|
|
|
Raises |
255
|
|
|
------ |
256
|
|
|
ValueError |
257
|
1 |
|
char not found in any keyboard layouts |
258
|
1 |
|
|
259
|
1 |
|
.. versionadded:: 0.3.0 |
260
|
1 |
|
|
261
|
1 |
|
""" |
262
|
|
|
for i, kb_mode in enumerate(kb_array): |
263
|
1 |
|
if char in kb_mode: |
264
|
1 |
|
return keyboard[i] |
265
|
1 |
|
raise ValueError(char + ' not found in any keyboard layouts') |
266
|
1 |
|
|
267
|
1 |
|
def _substitution_cost(char1: str, char2: str) -> float: |
268
|
|
|
if self._failsafe and (char1 not in keys or char2 not in keys): |
269
|
|
|
return ins_cost + del_cost |
270
|
1 |
|
cost = sub_cost |
271
|
|
|
cost *= metric_dict[self._metric](char1, char2) + shift_cost * ( |
272
|
1 |
|
_kb_array_for_char(char1) != _kb_array_for_char(char2) |
273
|
|
|
) |
274
|
|
|
return cost |
275
|
|
|
|
276
|
|
|
def _get_char_coord( |
277
|
|
|
char: str, kb_array: Tuple[Tuple[str, ...], ...] |
278
|
|
|
) -> Tuple[int, int]: |
279
|
|
|
"""Return the row & column of char in the keyboard. |
280
|
|
|
|
281
|
|
|
Parameters |
282
|
|
|
---------- |
283
|
|
|
char : str |
284
|
|
|
The character to search for |
285
|
|
|
kb_array : tuple of tuples |
286
|
|
|
The array of key positions |
287
|
|
|
|
288
|
|
|
Returns |
289
|
|
|
------- |
290
|
1 |
|
tuple |
291
|
1 |
|
The row & column of the key |
292
|
1 |
|
|
293
|
|
|
.. versionadded:: 0.3.0 |
294
|
1 |
|
|
295
|
1 |
|
""" |
296
|
1 |
|
for row in kb_array: # pragma: no branch |
297
|
1 |
|
if char in row: |
298
|
|
|
break |
299
|
1 |
|
return kb_array.index(row), row.index(char) |
|
|
|
|
300
|
1 |
|
|
301
|
1 |
|
def _euclidean_keyboard_distance(char1: str, char2: str) -> float: |
302
|
1 |
|
row1, col1 = _get_char_coord(char1, _kb_array_for_char(char1)) |
303
|
|
|
row2, col2 = _get_char_coord(char2, _kb_array_for_char(char2)) |
304
|
1 |
|
return ((row1 - row2) ** 2 + (col1 - col2) ** 2) ** 0.5 |
305
|
1 |
|
|
306
|
|
|
def _manhattan_keyboard_distance(char1: str, char2: str) -> float: |
307
|
1 |
|
row1, col1 = _get_char_coord(char1, _kb_array_for_char(char1)) |
308
|
1 |
|
row2, col2 = _get_char_coord(char2, _kb_array_for_char(char2)) |
309
|
|
|
return abs(row1 - row2) + abs(col1 - col2) |
310
|
1 |
|
|
311
|
|
|
def _log_euclidean_keyboard_distance(char1: str, char2: str) -> float: |
312
|
|
|
return log(1 + _euclidean_keyboard_distance(char1, char2)) |
313
|
|
|
|
314
|
|
|
def _log_manhattan_keyboard_distance(char1: str, char2: str) -> float: |
315
|
|
|
return log(1 + _manhattan_keyboard_distance(char1, char2)) |
316
|
|
|
|
317
|
1 |
|
metric_dict = { |
318
|
1 |
|
'euclidean': _euclidean_keyboard_distance, |
319
|
1 |
|
'manhattan': _manhattan_keyboard_distance, |
320
|
1 |
|
'log-euclidean': _log_euclidean_keyboard_distance, |
321
|
1 |
|
'log-manhattan': _log_manhattan_keyboard_distance, |
322
|
|
|
} |
323
|
1 |
|
|
324
|
1 |
|
d_mat = np_zeros((len(src) + 1, len(tar) + 1), dtype=np_float) |
325
|
1 |
|
for i in range(len(src) + 1): |
326
|
|
|
d_mat[i, 0] = i * del_cost |
327
|
|
|
for j in range(len(tar) + 1): |
328
|
|
|
d_mat[0, j] = j * ins_cost |
329
|
|
|
|
330
|
|
|
for i in range(len(src)): |
331
|
|
|
for j in range(len(tar)): |
332
|
|
|
d_mat[i + 1, j + 1] = min( |
333
|
|
|
d_mat[i + 1, j] + ins_cost, # ins |
334
|
|
|
d_mat[i, j + 1] + del_cost, # del |
335
|
|
|
d_mat[i, j] |
336
|
1 |
|
+ ( |
337
|
|
|
_substitution_cost(src[i], tar[j]) |
338
|
1 |
|
if src[i] != tar[j] |
339
|
|
|
else 0 |
340
|
|
|
), # sub/== |
341
|
|
|
) |
342
|
|
|
|
343
|
|
|
return cast(float, d_mat[len(src), len(tar)]) |
344
|
|
|
|
345
|
|
|
def dist(self, src: str, tar: str) -> float: |
346
|
|
|
"""Return the normalized typo distance between two strings. |
347
|
|
|
|
348
|
|
|
This is typo distance, normalized to [0, 1]. |
349
|
|
|
|
350
|
|
|
Parameters |
351
|
|
|
---------- |
352
|
|
|
src : str |
353
|
|
|
Source string for comparison |
354
|
|
|
tar : str |
355
|
|
|
Target string for comparison |
356
|
|
|
|
357
|
|
|
Returns |
358
|
|
|
------- |
359
|
|
|
float |
360
|
|
|
Normalized typo distance |
361
|
|
|
|
362
|
|
|
Examples |
363
|
|
|
-------- |
364
|
|
|
>>> cmp = Typo() |
365
|
|
|
>>> round(cmp.dist('cat', 'hat'), 12) |
366
|
|
|
0.527046276695 |
367
|
|
|
>>> round(cmp.dist('Niall', 'Neil'), 12) |
368
|
|
|
0.565028153987 |
369
|
|
|
>>> round(cmp.dist('Colin', 'Cuilen'), 12) |
370
|
|
|
0.569035593729 |
371
|
|
|
>>> cmp.dist('ATCG', 'TAGC') |
372
|
|
|
0.625 |
373
|
1 |
|
|
374
|
1 |
|
|
375
|
1 |
|
.. versionadded:: 0.3.0 |
376
|
1 |
|
.. versionchanged:: 0.3.6 |
377
|
|
|
Encapsulated in class |
378
|
|
|
|
379
|
|
|
""" |
380
|
|
|
if src == tar: |
381
|
1 |
|
return 0.0 |
382
|
|
|
ins_cost, del_cost = self._cost[:2] |
383
|
|
|
return self.dist_abs(src, tar) / ( |
384
|
|
|
max(len(src) * del_cost, len(tar) * ins_cost) |
385
|
|
|
) |
386
|
|
|
|
387
|
1 |
|
|
388
|
|
|
if __name__ == '__main__': |
389
|
|
|
import doctest |
390
|
|
|
|
391
|
|
|
doctest.testmod() |
392
|
|
|
|