abydos.distance._typo.Typo.dist()   A
last analyzed

Complexity

Conditions 2

Size

Total Lines 40
Code Lines 6

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 5
CRAP Score 2

Importance

Changes 0
Metric Value
eloc 6
dl 0
loc 40
ccs 5
cts 5
cp 1
rs 10
c 0
b 0
f 0
cc 2
nop 3
crap 2
1
# Copyright 2018-2020 by Christopher C. Little.
2
# This file is part of Abydos.
3
#
4
# Abydos is free software: you can redistribute it and/or modify
5
# it under the terms of the GNU General Public License as published by
6
# the Free Software Foundation, either version 3 of the License, or
7
# (at your option) any later version.
8
#
9
# Abydos is distributed in the hope that it will be useful,
10
# but WITHOUT ANY WARRANTY; without even the implied warranty of
11
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12
# GNU General Public License for more details.
13
#
14
# You should have received a copy of the GNU General Public License
15
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.
16
17
"""abydos.distance._typo.
18
19 1
Typo edit distance functions.
20
"""
21
22
from itertools import chain
23
from math import log
24 1
from typing import Any, Dict, Tuple, cast
25
26
from numpy import float_ as np_float
27
from numpy import zeros as np_zeros
28
29
from ._distance import _Distance
30
31 1
32 1
__all__ = ['Typo']
33
34 1
35
class Typo(_Distance):
36 1
    """Typo distance.
37 1
38
    This is inspired by Typo-Distance :cite:`Song:2011`, and a fair bit of
39 1
    this was copied from that module. Compared to the original, this supports
40
    different metrics for substitution.
41 1
42 1
    .. versionadded:: 0.3.6
43
    """
44
45 1
    # fmt: off
46
    _keyboard = {'QWERTY': (
47
        (('`', '1', '2', '3', '4', '5', '6', '7', '8', '9', '0', '-', '='),
48 1
         ('', 'q', 'w', 'e', 'r', 't', 'y', 'u', 'i', 'o', 'p', '[', ']',
49
          '\\'),
50
         ('', 'a', 's', 'd', 'f', 'g', 'h', 'j', 'k', 'l', ';', "'"),
51
         ('', 'z', 'x', 'c', 'v', 'b', 'n', 'm', ',', '.', '/'),
52
         ('', '', '', ' ')),
53
        (('~', '!', '@', '#', '$', '%', '^', '&', '*', '(', ')', '_', '+'),
54
         ('', 'Q', 'W', 'E', 'R', 'T', 'Y', 'U', 'I', 'O', 'P', '{', '}', '|'),
55
         ('', 'A', 'S', 'D', 'F', 'G', 'H', 'J', 'K', 'L', ':', '"'),
56
         ('', 'Z', 'X', 'C', 'V', 'B', 'N', 'M', '<', '>', '?'),
57
         ('', '', '', ' '))
58
    ), 'Dvorak': (
59 1
        (('`', '1', '2', '3', '4', '5', '6', '7', '8', '9', '0', '[', ']'),
60
         ('', "'", ',', '.', 'p', 'y', 'f', 'g', 'c', 'r', 'l', '/', '=',
61
          '\\'),
62
         ('', 'a', 'o', 'e', 'u', 'i', 'd', 'h', 't', 'n', 's', '-'),
63
         ('', ';', 'q', 'j', 'k', 'x', 'b', 'm', 'w', 'v', 'z'),
64
         ('', '', '', ' ')),
65
        (('~', '!', '@', '#', '$', '%', '^', '&', '*', '(', ')', '{', '}'),
66
         ('', '"', '<', '>', 'P', 'Y', 'F', 'G', 'C', 'R', 'L', '?', '+', '|'),
67
         ('', 'A', 'O', 'E', 'U', 'I', 'D', 'H', 'T', 'N', 'S', '_'),
68
         ('', ':', 'Q', 'J', 'K', 'X', 'B', 'M', 'W', 'V', 'Z'),
69
         ('', '', '', ' '))
70
    ), 'AZERTY': (
71
        (('²', '&', 'é', '"', "'", '(', '-', 'è', '_', 'ç', 'à', ')', '='),
72
         ('', 'a', 'z', 'e', 'r', 't', 'y', 'u', 'i', 'o', 'p', '', '$'),
73
         ('', 'q', 's', 'd', 'f', 'g', 'h', 'j', 'k', 'l', 'm', 'ù', '*'),
74
         ('<', 'w', 'x', 'c', 'v', 'b', 'n', ',', ';', ':', '!'),
75
         ('', '', '', ' ')),
76
        (('~', '1', '2', '3', '4', '5', '6', '7', '8', '9', '0', '°', '+'),
77
         ('', 'A', 'W', 'E', 'R', 'T', 'Y', 'U', 'I', 'O', 'P', '', '£'),
78
         ('', 'Q', 'S', 'D', 'F', 'G', 'H', 'J', 'K', 'L', 'M', 'Ù', 'μ'),
79
         ('>', 'W', 'X', 'C', 'V', 'B', 'N', '?', '.', '/', '§'),
80
         ('', '', '', ' '))
81
    ), 'QWERTZ': (
82
        (('', '1', '2', '3', '4', '5', '6', '7', '8', '9', '0', 'ß', ''),
83
         ('', 'q', 'w', 'e', 'r', 't', 'z', 'u', 'i', 'o', 'p', ' ü', '+',
84
          '\\'),
85
         ('', 'a', 's', 'd', 'f', 'g', 'h', 'j', 'k', 'l', 'ö', 'ä', '#'),
86
         ('<', 'y', 'x', 'c', 'v', 'b', 'n', 'm', ',', '.', '-'),
87
         ('', '', '', ' ')),
88
        (('°', '!', '"', '§', '$', '%', '&', '/', '(', ')', '=', '?', ''),
89
         ('', 'Q', 'W', 'E', 'R', 'T', 'Z', 'U', 'I', 'O', 'P', 'Ü', '*', ''),
90
         ('', 'A', 'S', 'D', 'F', 'G', 'H', 'J', 'K', 'L', 'Ö', 'Ä', "'"),
91
         ('>', 'Y', 'X', 'C', 'V', 'B', 'N', 'M', ';', ':', '_'),
92
         ('', '', '', ' '))
93
    ), 'QWERTZ_HUN': (
94
        (('0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'ö', 'ü', 'ó'),
95
         ('', 'q', 'w', 'e', 'r', 't', 'z', 'u', 'i', 'o', 'p', ' ő', 'ú',
96
          '\\'),
97
         ('', 'a', 's', 'd', 'f', 'g', 'h', 'j', 'k', 'l', 'é', 'á', 'ű'),
98
         ('í', 'y', 'x', 'c', 'v', 'b', 'n', 'm', ',', '.', '-'),
99
         ('', '', '', ' ')),
100
        (('§', "'", '"', '+', '!', '%', '/', '=', '(', ')', 'Ö', 'Ü', 'Ó'),
101
         ('', 'Q', 'W', 'E', 'R', 'T', 'Z', 'U', 'I', 'O', 'P', 'Ő', 'Ú', ''),
102
         ('', 'A', 'S', 'D', 'F', 'G', 'H', 'J', 'K', 'L', 'É', 'Á', 'Ű'),
103
         ('Í', 'Y', 'X', 'C', 'V', 'B', 'N', 'M', '?', ':', '_'),
104
         ('', '', '', ' ')),
105
        (('', '~', 'ˇ', '^', '˘', '°', '˛', '`', '˙', '´', '˝', '¨', '¸'),
106
         ('', '\\', '|', '', '', '', '', '€', '', '', '', '÷', '×', ''),
107
         ('', '', 'đ', 'Đ', '[', ']', '', '', 'ł', 'Ł', '$', 'ß', '¤'),
108
         ('<', '>', '#', '&', '@', '{', '}', '', ';', '>', '*'),
109 1
         ('', '', '', ' '))
110
    )}  # type: Dict[str, Tuple[Tuple[Tuple[str, ...], ...], ...]]
111
    # fmt: on
112
113
    def __init__(
114
        self,
115
        metric: str = 'euclidean',
116
        cost: Tuple[float, float, float, float] = (1.0, 1.0, 0.5, 0.5),
117
        layout: str = 'QWERTY',
118
        failsafe: bool = False,
119
        **kwargs: Any
120
    ):
121
        """Initialize Typo instance.
122
123
        Parameters
124
        ----------
125
        metric : str
126
            Supported values include: ``euclidean``, ``manhattan``,
127
            ``log-euclidean``, and ``log-manhattan``
128
        cost : tuple
129
            A 4-tuple representing the cost of the four possible edits:
130
            inserts, deletes, substitutions, and shift, respectively (by
131
            default: (1, 1, 0.5, 0.5)) The substitution & shift costs should be
132
            significantly less than the cost of an insertion & deletion unless
133
            a log metric is used.
134
        layout : str
135
            Name of the keyboard layout to use (Currently supported:
136
            ``QWERTY``, ``Dvorak``, ``AZERTY``, ``QWERTZ``, ``auto``). If
137
            ``auto`` is selected, the class will attempt to determine an
138
            appropriate keyboard based on the supplied words.
139
        failsafe : bool
140
            If True, substitution of an unknown character (one not present on
141
            the selected keyboard) will incur a cost equal to an insertion plus
142
            a deletion.
143
        **kwargs
144
            Arbitrary keyword arguments
145
146 1
147 1
        .. versionadded:: 0.4.0
148 1
149 1
        """
150 1
        super(Typo, self).__init__(**kwargs)
151
        self._metric = metric
152 1
        self._cost = cost
153
        self._layout = layout
154
        self._failsafe = failsafe
155
156
    def dist_abs(self, src: str, tar: str) -> float:
157
        """Return the typo distance between two strings.
158
159
        Parameters
160
        ----------
161
        src : str
162
            Source string for comparison
163
        tar : str
164
            Target string for comparison
165
166
        Returns
167
        -------
168
        float
169
            Typo distance
170
171
        Raises
172
        ------
173
        ValueError
174
            char not found in any keyboard layouts
175
176
        Examples
177
        --------
178
        >>> cmp = Typo()
179
        >>> cmp.dist_abs('cat', 'hat')
180
        1.5811388300841898
181
        >>> cmp.dist_abs('Niall', 'Neil')
182
        2.8251407699364424
183
        >>> cmp.dist_abs('Colin', 'Cuilen')
184
        3.414213562373095
185
        >>> cmp.dist_abs('ATCG', 'TAGC')
186
        2.5
187
188
        >>> cmp = Typo(metric='manhattan')
189
        >>> cmp.dist_abs('cat', 'hat')
190
        2.0
191
        >>> cmp.dist_abs('Niall', 'Neil')
192
        3.0
193
        >>> cmp.dist_abs('Colin', 'Cuilen')
194
        3.5
195
        >>> cmp.dist_abs('ATCG', 'TAGC')
196
        2.5
197
198
        >>> cmp = Typo(metric='log-manhattan')
199
        >>> cmp.dist_abs('cat', 'hat')
200
        0.8047189562170501
201
        >>> cmp.dist_abs('Niall', 'Neil')
202
        2.2424533248940004
203
        >>> cmp.dist_abs('Colin', 'Cuilen')
204
        2.242453324894
205
        >>> cmp.dist_abs('ATCG', 'TAGC')
206
        2.3465735902799727
207
208
209
        .. versionadded:: 0.3.0
210 1
        .. versionchanged:: 0.3.6
211
            Encapsulated in class
212 1
213 1
        """
214 1
        ins_cost, del_cost, sub_cost, shift_cost = self._cost
215 1
216 1
        if src == tar:
217 1
            return 0.0
218
        if not src:
219 1
            return len(tar) * ins_cost
220 1
        if not tar:
221 1
            return len(src) * del_cost
222 1
223 1
        if self._layout == 'auto':
224 1
            for kb in ['QWERTY', 'QWERTZ', 'AZERTY']:
225 1
                keys = set(chain(*chain(*self._keyboard[kb])))
226
                letters = set(src) | set(tar)
227
                if not (letters - keys):
228 1
                    keyboard = self._keyboard[kb]
229
                    break
230 1
            else:
231
                # Fallback to QWERTY
232 1
                keyboard = self._keyboard['QWERTY']
233 1
        else:
234 1
            keyboard = self._keyboard[self._layout]
235
236 1
        kb_array = []
237
        for kb_mode in keyboard:
238
            kb_array.append({item for sublist in kb_mode for item in sublist})
239
        keys = set(chain(*chain(*keyboard)))
240
241
        def _kb_array_for_char(char: str) -> Tuple[Tuple[str, ...], ...]:
242
            """Return the keyboard layout that contains ch.
243
244
            Parameters
245
            ----------
246
            char : str
247
                The character to lookup
248
249
            Returns
250
            -------
251
            tuple
252
                A keyboard
253
254
            Raises
255
            ------
256
            ValueError
257 1
                char not found in any keyboard layouts
258 1
259 1
            .. versionadded:: 0.3.0
260 1
261 1
            """
262
            for i, kb_mode in enumerate(kb_array):
263 1
                if char in kb_mode:
264 1
                    return keyboard[i]
265 1
            raise ValueError(char + ' not found in any keyboard layouts')
266 1
267 1
        def _substitution_cost(char1: str, char2: str) -> float:
268
            if self._failsafe and (char1 not in keys or char2 not in keys):
269
                return ins_cost + del_cost
270 1
            cost = sub_cost
271
            cost *= metric_dict[self._metric](char1, char2) + shift_cost * (
272 1
                _kb_array_for_char(char1) != _kb_array_for_char(char2)
273
            )
274
            return cost
275
276
        def _get_char_coord(
277
            char: str, kb_array: Tuple[Tuple[str, ...], ...]
278
        ) -> Tuple[int, int]:
279
            """Return the row & column of char in the keyboard.
280
281
            Parameters
282
            ----------
283
            char : str
284
                The character to search for
285
            kb_array : tuple of tuples
286
                The array of key positions
287
288
            Returns
289
            -------
290 1
            tuple
291 1
                The row & column of the key
292 1
293
            .. versionadded:: 0.3.0
294 1
295 1
            """
296 1
            for row in kb_array:  # pragma: no branch
297 1
                if char in row:
298
                    break
299 1
            return kb_array.index(row), row.index(char)
0 ignored issues
show
introduced by
The variable row does not seem to be defined in case the for loop on line 296 is not entered. Are you sure this can never be the case?
Loading history...
300 1
301 1
        def _euclidean_keyboard_distance(char1: str, char2: str) -> float:
302 1
            row1, col1 = _get_char_coord(char1, _kb_array_for_char(char1))
303
            row2, col2 = _get_char_coord(char2, _kb_array_for_char(char2))
304 1
            return ((row1 - row2) ** 2 + (col1 - col2) ** 2) ** 0.5
305 1
306
        def _manhattan_keyboard_distance(char1: str, char2: str) -> float:
307 1
            row1, col1 = _get_char_coord(char1, _kb_array_for_char(char1))
308 1
            row2, col2 = _get_char_coord(char2, _kb_array_for_char(char2))
309
            return abs(row1 - row2) + abs(col1 - col2)
310 1
311
        def _log_euclidean_keyboard_distance(char1: str, char2: str) -> float:
312
            return log(1 + _euclidean_keyboard_distance(char1, char2))
313
314
        def _log_manhattan_keyboard_distance(char1: str, char2: str) -> float:
315
            return log(1 + _manhattan_keyboard_distance(char1, char2))
316
317 1
        metric_dict = {
318 1
            'euclidean': _euclidean_keyboard_distance,
319 1
            'manhattan': _manhattan_keyboard_distance,
320 1
            'log-euclidean': _log_euclidean_keyboard_distance,
321 1
            'log-manhattan': _log_manhattan_keyboard_distance,
322
        }
323 1
324 1
        d_mat = np_zeros((len(src) + 1, len(tar) + 1), dtype=np_float)
325 1
        for i in range(len(src) + 1):
326
            d_mat[i, 0] = i * del_cost
327
        for j in range(len(tar) + 1):
328
            d_mat[0, j] = j * ins_cost
329
330
        for i in range(len(src)):
331
            for j in range(len(tar)):
332
                d_mat[i + 1, j + 1] = min(
333
                    d_mat[i + 1, j] + ins_cost,  # ins
334
                    d_mat[i, j + 1] + del_cost,  # del
335
                    d_mat[i, j]
336 1
                    + (
337
                        _substitution_cost(src[i], tar[j])
338 1
                        if src[i] != tar[j]
339
                        else 0
340
                    ),  # sub/==
341
                )
342
343
        return cast(float, d_mat[len(src), len(tar)])
344
345
    def dist(self, src: str, tar: str) -> float:
346
        """Return the normalized typo distance between two strings.
347
348
        This is typo distance, normalized to [0, 1].
349
350
        Parameters
351
        ----------
352
        src : str
353
            Source string for comparison
354
        tar : str
355
            Target string for comparison
356
357
        Returns
358
        -------
359
        float
360
            Normalized typo distance
361
362
        Examples
363
        --------
364
        >>> cmp = Typo()
365
        >>> round(cmp.dist('cat', 'hat'), 12)
366
        0.527046276695
367
        >>> round(cmp.dist('Niall', 'Neil'), 12)
368
        0.565028153987
369
        >>> round(cmp.dist('Colin', 'Cuilen'), 12)
370
        0.569035593729
371
        >>> cmp.dist('ATCG', 'TAGC')
372
        0.625
373 1
374 1
375 1
        .. versionadded:: 0.3.0
376 1
        .. versionchanged:: 0.3.6
377
            Encapsulated in class
378
379
        """
380
        if src == tar:
381 1
            return 0.0
382
        ins_cost, del_cost = self._cost[:2]
383
        return self.dist_abs(src, tar) / (
384
            max(len(src) * del_cost, len(tar) * ins_cost)
385
        )
386
387 1
388
if __name__ == '__main__':
389
    import doctest
390
391
    doctest.testmod()
392