Completed
Push — master ( 3ac297...afe14d )
by Chris
16:40 queued 07:25
created

abydos.distance._typo.Typo.dist_abs()   D

Complexity

Conditions 13

Size

Total Lines 178
Code Lines 64

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 47
CRAP Score 13

Importance

Changes 0
Metric Value
cc 13
eloc 64
nop 6
dl 0
loc 178
ccs 47
cts 47
cp 1
crap 13
rs 4.1181
c 0
b 0
f 0

How to fix   Long Method    Complexity   

Long Method

Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.

For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.

Commonly applied refactorings include:

Complexity

Complex classes like abydos.distance._typo.Typo.dist_abs() often do a lot of different things. To break such a class down, we need to identify a cohesive component within that class. A common approach to find such a component is to look for fields/methods that share the same prefixes, or suffixes.

Once you have determined the fields that belong together, you can apply the Extract Class refactoring. If the component makes sense as a sub-class, Extract Subclass is also a candidate, and is often faster.

1
# -*- coding: utf-8 -*-
2
3
# Copyright 2018 by Christopher C. Little.
4
# This file is part of Abydos.
5
#
6
# Abydos is free software: you can redistribute it and/or modify
7
# it under the terms of the GNU General Public License as published by
8
# the Free Software Foundation, either version 3 of the License, or
9
# (at your option) any later version.
10
#
11
# Abydos is distributed in the hope that it will be useful,
12
# but WITHOUT ANY WARRANTY; without even the implied warranty of
13
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14
# GNU General Public License for more details.
15
#
16
# You should have received a copy of the GNU General Public License
17
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.
18
19 1
"""abydos.distance._typo.
20
21
Typo edit distance functions.
22
"""
23
24 1
from __future__ import (
25
    absolute_import,
26
    division,
27
    print_function,
28
    unicode_literals,
29
)
30
31 1
from math import log
32
33 1
from numpy import float32 as np_float32
34 1
from numpy import zeros as np_zeros
35
36 1
from six.moves import range
37
38 1
from ._distance import _Distance
39
40 1
__all__ = ['Typo', 'dist_typo', 'sim_typo', 'typo']
41
42
43 1
class Typo(_Distance):
44
    """Typo distance.
45
46
    This is inspired by Typo-Distance :cite:`Song:2011`, and a fair bit of
47
    this was copied from that module. Compared to the original, this supports
48
    different metrics for substitution.
49
    """
50
51
    # fmt: off
52 1
    _keyboard = {'QWERTY': (
53
        (('`', '1', '2', '3', '4', '5', '6', '7', '8', '9', '0', '-', '='),
54
         ('', 'q', 'w', 'e', 'r', 't', 'y', 'u', 'i', 'o', 'p', '[', ']',
55
          '\\'),
56
         ('', 'a', 's', 'd', 'f', 'g', 'h', 'j', 'k', 'l', ';', '\''),
57
         ('', 'z', 'x', 'c', 'v', 'b', 'n', 'm', ',', '.', '/')),
58
        (('~', '!', '@', '#', '$', '%', '^', '&', '*', '(', ')', '_', '+'),
59
         ('', 'Q', 'W', 'E', 'R', 'T', 'Y', 'U', 'I', 'O', 'P', '{', '}', '|'),
60
         ('', 'A', 'S', 'D', 'F', 'G', 'H', 'J', 'K', 'L', ':', '"'),
61
         ('', 'Z', 'X', 'C', 'V', 'B', 'N', 'M', '<', '>', '?'))
62
    ), 'Dvorak': (
63
        (('`', '1', '2', '3', '4', '5', '6', '7', '8', '9', '0', '[', ']'),
64
         ('', '\'', ',', '.', 'p', 'y', 'f', 'g', 'c', 'r', 'l', '/', '=',
65
          '\\'),
66
         ('', 'a', 'o', 'e', 'u', 'i', 'd', 'h', 't', 'n', 's', '-'),
67
         ('', ';', 'q', 'j', 'k', 'x', 'b', 'm', 'w', 'v', 'z')),
68
        (('~', '!', '@', '#', '$', '%', '^', '&', '*', '(', ')', '{', '}'),
69
         ('', '"', '<', '>', 'P', 'Y', 'F', 'G', 'C', 'R', 'L', '?', '+', '|'),
70
         ('', 'A', 'O', 'E', 'U', 'I', 'D', 'H', 'T', 'N', 'S', '_'),
71
         ('', ':', 'Q', 'J', 'K', 'X', 'B', 'M', 'W', 'V', 'Z'))
72
    ), 'AZERTY': (
73
        (('²', '&', 'é', '"', '\'', '(', '-', 'è', '_', 'ç', 'à', ')', '='),
74
         ('', 'a', 'z', 'e', 'r', 't', 'y', 'u', 'i', 'o', 'p', '', '$'),
75
         ('', 'q', 's', 'd', 'f', 'g', 'h', 'j', 'k', 'l', 'm', 'ù', '*'),
76
         ('<', 'w', 'x', 'c', 'v', 'b', 'n', ',', ';', ':', '!')),
77
        (('~', '1', '2', '3', '4', '5', '6', '7', '8', '9', '0', '°', '+'),
78
         ('', 'A', 'W', 'E', 'R', 'T', 'Y', 'U', 'I', 'O', 'P', '', '£'),
79
         ('', 'Q', 'S', 'D', 'F', 'G', 'H', 'J', 'K', 'L', 'M', 'Ù', 'μ'),
80
         ('>', 'W', 'X', 'C', 'V', 'B', 'N', '?', '.', '/', '§'))
81
    ), 'QWERTZ': (
82
        (('', '1', '2', '3', '4', '5', '6', '7', '8', '9', '0', 'ß', ''),
83
         ('', 'q', 'w', 'e', 'r', 't', 'z', 'u', 'i', 'o', 'p', ' ü', '+',
84
          '\\'),
85
         ('', 'a', 's', 'd', 'f', 'g', 'h', 'j', 'k', 'l', 'ö', 'ä', '#'),
86
         ('<', 'y', 'x', 'c', 'v', 'b', 'n', 'm', ',', '.', '-')),
87
        (('°', '!', '"', '§', '$', '%', '&', '/', '(', ')', '=', '?', ''),
88
         ('', 'Q', 'W', 'E', 'R', 'T', 'Z', 'U', 'I', 'O', 'P', 'Ü', '*', ''),
89
         ('', 'A', 'S', 'D', 'F', 'G', 'H', 'J', 'K', 'L', 'Ö', 'Ä', '\''),
90
         ('>', 'Y', 'X', 'C', 'V', 'B', 'N', 'M', ';', ':', '_'))
91
    )}
92
    # fmt: on
93
94 1
    def dist_abs(
95
        self,
96
        src,
97
        tar,
98
        metric='euclidean',
99
        cost=(1, 1, 0.5, 0.5),
100
        layout='QWERTY',
101
    ):
102
        """Return the typo distance between two strings.
103
104
        Parameters
105
        ----------
106
        src : str
107
            Source string for comparison
108
        tar : str
109
            Target string for comparison
110
        metric : str
111
            Supported values include: ``euclidean``, ``manhattan``,
112
            ``log-euclidean``, and ``log-manhattan``
113
        cost : tuple
114
            A 4-tuple representing the cost of the four possible edits:
115
            inserts, deletes, substitutions, and shift, respectively (by
116
            default: (1, 1, 0.5, 0.5)) The substitution & shift costs should be
117
            significantly less than the cost of an insertion & deletion unless
118
            a log metric is used.
119
        layout : str
120
            Name of the keyboard layout to use (Currently supported:
121
            ``QWERTY``, ``Dvorak``, ``AZERTY``, ``QWERTZ``)
122
123
        Returns
124
        -------
125
        float
126
            Typo distance
127
128
        Raises
129
        ------
130
        ValueError
131
            char not found in any keyboard layouts
132
133
        Examples
134
        --------
135
        >>> cmp = Typo()
136
        >>> cmp.dist_abs('cat', 'hat')
137
        1.5811388
138
        >>> cmp.dist_abs('Niall', 'Neil')
139
        2.8251407
140
        >>> cmp.dist_abs('Colin', 'Cuilen')
141
        3.4142137
142
        >>> cmp.dist_abs('ATCG', 'TAGC')
143
        2.5
144
145
        >>> cmp.dist_abs('cat', 'hat', metric='manhattan')
146
        2.0
147
        >>> cmp.dist_abs('Niall', 'Neil', metric='manhattan')
148
        3.0
149
        >>> cmp.dist_abs('Colin', 'Cuilen', metric='manhattan')
150
        3.5
151
        >>> cmp.dist_abs('ATCG', 'TAGC', metric='manhattan')
152
        2.5
153
154
        >>> cmp.dist_abs('cat', 'hat', metric='log-manhattan')
155
        0.804719
156
        >>> cmp.dist_abs('Niall', 'Neil', metric='log-manhattan')
157
        2.2424533
158
        >>> cmp.dist_abs('Colin', 'Cuilen', metric='log-manhattan')
159
        2.2424533
160
        >>> cmp.dist_abs('ATCG', 'TAGC', metric='log-manhattan')
161
        2.3465736
162
163
        """
164 1
        ins_cost, del_cost, sub_cost, shift_cost = cost
165
166 1
        if src == tar:
167 1
            return 0.0
168 1
        if not src:
169 1
            return len(tar) * ins_cost
170 1
        if not tar:
171 1
            return len(src) * del_cost
172
173 1
        keyboard = self._keyboard[layout]
174 1
        lowercase = {item for sublist in keyboard[0] for item in sublist}
175 1
        uppercase = {item for sublist in keyboard[1] for item in sublist}
176
177 1
        def _kb_array_for_char(char):
178
            """Return the keyboard layout that contains ch.
179
180
            Parameters
181
            ----------
182
            char : str
183
                The character to lookup
184
185
            Returns
186
            -------
187
            tuple
188
                A keyboard
189
190
            Raises
191
            ------
192
            ValueError
193
                char not found in any keyboard layouts
194
195
            """
196 1
            if char in lowercase:
197 1
                return keyboard[0]
198 1
            elif char in uppercase:
199 1
                return keyboard[1]
200 1
            raise ValueError(char + ' not found in any keyboard layouts')
201
202 1
        def _substitution_cost(char1, char2):
203 1
            cost = sub_cost
204 1
            cost *= metric_dict[metric](char1, char2) + shift_cost * (
205
                _kb_array_for_char(char1) != _kb_array_for_char(char2)
206
            )
207 1
            return cost
208
209 1
        def _get_char_coord(char, kb_array):
210
            """Return the row & column of char in the keyboard.
211
212
            Parameters
213
            ----------
214
            char : str
215
                The character to search for
216
            kb_array : tuple of tuples
217
                The array of key positions
218
219
            Returns
220
            -------
221
            tuple
222
                The row & column of the key
223
224
            """
225 1
            for row in kb_array:  # pragma: no branch
226 1
                if char in row:
227 1
                    return kb_array.index(row), row.index(char)
228
229 1
        def _euclidean_keyboard_distance(char1, char2):
230 1
            row1, col1 = _get_char_coord(char1, _kb_array_for_char(char1))
231 1
            row2, col2 = _get_char_coord(char2, _kb_array_for_char(char2))
232 1
            return ((row1 - row2) ** 2 + (col1 - col2) ** 2) ** 0.5
233
234 1
        def _manhattan_keyboard_distance(char1, char2):
235 1
            row1, col1 = _get_char_coord(char1, _kb_array_for_char(char1))
236 1
            row2, col2 = _get_char_coord(char2, _kb_array_for_char(char2))
237 1
            return abs(row1 - row2) + abs(col1 - col2)
238
239 1
        def _log_euclidean_keyboard_distance(char1, char2):
240 1
            return log(1 + _euclidean_keyboard_distance(char1, char2))
241
242 1
        def _log_manhattan_keyboard_distance(char1, char2):
243 1
            return log(1 + _manhattan_keyboard_distance(char1, char2))
244
245 1
        metric_dict = {
246
            'euclidean': _euclidean_keyboard_distance,
247
            'manhattan': _manhattan_keyboard_distance,
248
            'log-euclidean': _log_euclidean_keyboard_distance,
249
            'log-manhattan': _log_manhattan_keyboard_distance,
250
        }
251
252 1
        d_mat = np_zeros((len(src) + 1, len(tar) + 1), dtype=np_float32)
253 1
        for i in range(len(src) + 1):
254 1
            d_mat[i, 0] = i * del_cost
255 1
        for j in range(len(tar) + 1):
256 1
            d_mat[0, j] = j * ins_cost
257
258 1
        for i in range(len(src)):
259 1
            for j in range(len(tar)):
260 1
                d_mat[i + 1, j + 1] = min(
261
                    d_mat[i + 1, j] + ins_cost,  # ins
262
                    d_mat[i, j + 1] + del_cost,  # del
263
                    d_mat[i, j]
264
                    + (
265
                        _substitution_cost(src[i], tar[j])
266
                        if src[i] != tar[j]
267
                        else 0
268
                    ),  # sub/==
269
                )
270
271 1
        return d_mat[len(src), len(tar)]
272
273 1
    def dist(
274
        self,
275
        src,
276
        tar,
277
        metric='euclidean',
278
        cost=(1, 1, 0.5, 0.5),
279
        layout='QWERTY',
280
    ):
281
        """Return the normalized typo distance between two strings.
282
283
        This is typo distance, normalized to [0, 1].
284
285
        Parameters
286
        ----------
287
        src : str
288
            Source string for comparison
289
        tar : str
290
            Target string for comparison
291
        metric : str
292
            Supported values include: ``euclidean``, ``manhattan``,
293
            ``log-euclidean``, and ``log-manhattan``
294
        cost : tuple
295
            A 4-tuple representing the cost of the four possible edits:
296
            inserts, deletes, substitutions, and shift, respectively (by
297
            default: (1, 1, 0.5, 0.5)) The substitution & shift costs should be
298
            significantly less than the cost of an insertion & deletion unless
299
            a log metric is used.
300
        layout : str
301
            Name of the keyboard layout to use (Currently supported:
302
            ``QWERTY``, ``Dvorak``, ``AZERTY``, ``QWERTZ``)
303
304
        Returns
305
        -------
306
        float
307
            Normalized typo distance
308
309
        Examples
310
        --------
311
        >>> cmp = Typo()
312
        >>> round(cmp.dist('cat', 'hat'), 12)
313
        0.527046283086
314
        >>> round(cmp.dist('Niall', 'Neil'), 12)
315
        0.565028142929
316
        >>> round(cmp.dist('Colin', 'Cuilen'), 12)
317
        0.569035609563
318
        >>> cmp.dist('ATCG', 'TAGC')
319
        0.625
320
321
        """
322 1
        if src == tar:
323 1
            return 0.0
324 1
        ins_cost, del_cost = cost[:2]
325 1
        return self.dist_abs(src, tar, metric, cost, layout) / (
326
            max(len(src) * del_cost, len(tar) * ins_cost)
327
        )
328
329
330 1
def typo(src, tar, metric='euclidean', cost=(1, 1, 0.5, 0.5), layout='QWERTY'):
331
    """Return the typo distance between two strings.
332
333
    This is a wrapper for :py:meth:`Typo.typo`.
334
335
    Parameters
336
    ----------
337
    src : str
338
        Source string for comparison
339
    tar : str
340
        Target string for comparison
341
    metric : str
342
        Supported values include: ``euclidean``, ``manhattan``,
343
        ``log-euclidean``, and ``log-manhattan``
344
    cost : tuple
345
        A 4-tuple representing the cost of the four possible edits: inserts,
346
        deletes, substitutions, and shift, respectively (by default:
347
        (1, 1, 0.5, 0.5)) The substitution & shift costs should be
348
        significantly less than the cost of an insertion & deletion unless a
349
        log metric is used.
350
    layout : str
351
        Name of the keyboard layout to use (Currently supported:
352
        ``QWERTY``, ``Dvorak``, ``AZERTY``, ``QWERTZ``)
353
354
    Returns
355
    -------
356
    float
357
        Typo distance
358
359
    Examples
360
    --------
361
    >>> typo('cat', 'hat')
362
    1.5811388
363
    >>> typo('Niall', 'Neil')
364
    2.8251407
365
    >>> typo('Colin', 'Cuilen')
366
    3.4142137
367
    >>> typo('ATCG', 'TAGC')
368
    2.5
369
370
    >>> typo('cat', 'hat', metric='manhattan')
371
    2.0
372
    >>> typo('Niall', 'Neil', metric='manhattan')
373
    3.0
374
    >>> typo('Colin', 'Cuilen', metric='manhattan')
375
    3.5
376
    >>> typo('ATCG', 'TAGC', metric='manhattan')
377
    2.5
378
379
    >>> typo('cat', 'hat', metric='log-manhattan')
380
    0.804719
381
    >>> typo('Niall', 'Neil', metric='log-manhattan')
382
    2.2424533
383
    >>> typo('Colin', 'Cuilen', metric='log-manhattan')
384
    2.2424533
385
    >>> typo('ATCG', 'TAGC', metric='log-manhattan')
386
    2.3465736
387
388
    """
389 1
    return Typo().dist_abs(src, tar, metric, cost, layout)
390
391
392 1
def dist_typo(
393
    src, tar, metric='euclidean', cost=(1, 1, 0.5, 0.5), layout='QWERTY'
394
):
395
    """Return the normalized typo distance between two strings.
396
397
    This is a wrapper for :py:meth:`Typo.dist`.
398
399
    Parameters
400
    ----------
401
    src : str
402
        Source string for comparison
403
    tar : str
404
        Target string for comparison
405
    metric : str
406
        Supported values include: ``euclidean``, ``manhattan``,
407
        ``log-euclidean``, and ``log-manhattan``
408
    cost : tuple
409
        A 4-tuple representing the cost of the four possible edits: inserts,
410
        deletes, substitutions, and shift, respectively (by default:
411
        (1, 1, 0.5, 0.5)) The substitution & shift costs should be
412
        significantly less than the cost of an insertion & deletion unless a
413
        log metric is used.
414
    layout : str
415
        Name of the keyboard layout to use (Currently supported:
416
        ``QWERTY``, ``Dvorak``, ``AZERTY``, ``QWERTZ``)
417
418
    Returns
419
    -------
420
    float
421
        Normalized typo distance
422
423
    Examples
424
    --------
425
    >>> round(dist_typo('cat', 'hat'), 12)
426
    0.527046283086
427
    >>> round(dist_typo('Niall', 'Neil'), 12)
428
    0.565028142929
429
    >>> round(dist_typo('Colin', 'Cuilen'), 12)
430
    0.569035609563
431
    >>> dist_typo('ATCG', 'TAGC')
432
    0.625
433
434
    """
435 1
    return Typo().dist(src, tar, metric, cost, layout)
436
437
438 1
def sim_typo(
439
    src, tar, metric='euclidean', cost=(1, 1, 0.5, 0.5), layout='QWERTY'
440
):
441
    """Return the normalized typo similarity between two strings.
442
443
    This is a wrapper for :py:meth:`Typo.sim`.
444
445
    Parameters
446
    ----------
447
    src : str
448
        Source string for comparison
449
    tar : str
450
        Target string for comparison
451
    metric : str
452
        Supported values include: ``euclidean``, ``manhattan``,
453
        ``log-euclidean``, and ``log-manhattan``
454
    cost : tuple
455
        A 4-tuple representing the cost of the four possible edits: inserts,
456
        deletes, substitutions, and shift, respectively (by default:
457
        (1, 1, 0.5, 0.5)) The substitution & shift costs should be
458
        significantly less than the cost of an insertion & deletion unless a
459
        log metric is used.
460
    layout : str
461
        Name of the keyboard layout to use (Currently supported:
462
        ``QWERTY``, ``Dvorak``, ``AZERTY``, ``QWERTZ``)
463
464
    Returns
465
    -------
466
    float
467
        Normalized typo similarity
468
469
    Examples
470
    --------
471
    >>> round(sim_typo('cat', 'hat'), 12)
472
    0.472953716914
473
    >>> round(sim_typo('Niall', 'Neil'), 12)
474
    0.434971857071
475
    >>> round(sim_typo('Colin', 'Cuilen'), 12)
476
    0.430964390437
477
    >>> sim_typo('ATCG', 'TAGC')
478
    0.375
479
480
    """
481 1
    return Typo().sim(src, tar, metric, cost, layout)
482
483
484
if __name__ == '__main__':
485
    import doctest
486
487
    doctest.testmod()
488