Completed
Branch master (78a222)
by Chris
14:36
created

abydos.distance._typo   A

Complexity

Total Complexity 16

Size/Duplication

Total Lines 274
Duplicated Lines 0 %

Test Coverage

Coverage 100%

Importance

Changes 0
Metric Value
wmc 16
eloc 116
dl 0
loc 274
ccs 62
cts 62
cp 1
rs 10
c 0
b 0
f 0

3 Functions

Rating   Name   Duplication   Size   Complexity  
A sim_typo() 0 28 1
F typo() 0 168 13
A dist_typo() 0 31 2
1
# -*- coding: utf-8 -*-
2
3
# Copyright 2018 by Christopher C. Little.
4
# This file is part of Abydos.
5
#
6
# Abydos is free software: you can redistribute it and/or modify
7
# it under the terms of the GNU General Public License as published by
8
# the Free Software Foundation, either version 3 of the License, or
9
# (at your option) any later version.
10
#
11
# Abydos is distributed in the hope that it will be useful,
12
# but WITHOUT ANY WARRANTY; without even the implied warranty of
13
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14
# GNU General Public License for more details.
15
#
16
# You should have received a copy of the GNU General Public License
17
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.
18
19 1
"""abydos.distance.typo.
20
21
The distance.typo module implements typo edit distance functions.
22
"""
23
24 1
from __future__ import division, unicode_literals
25
26 1
from math import log
27
28 1
from numpy import float32 as np_float32
29 1
from numpy import zeros as np_zeros
30
31 1
from six.moves import range
32
33 1
__all__ = ['dist_typo', 'sim_typo', 'typo']
34
35
36 1
def typo(src, tar, metric='euclidean', cost=(1, 1, 0.5, 0.5), layout='QWERTY'):
0 ignored issues
show
Comprehensibility introduced by
This function exceeds the maximum number of variables (24/15).
Loading history...
37
    """Return the typo distance between two strings.
38
39
    This is inspired by Typo-Distance :cite:`Song:2011`, and a fair bit of
40
    this was copied from that module. Compared to the original, this supports
41
    different metrics for substitution.
42
43
    :param str src: source string for comparison
44
    :param str tar: target string for comparison
45
    :param str metric: supported values include: 'euclidean', 'manhattan',
46
          'log-euclidean', and 'log-manhattan'
47
    :param tuple cost: a 4-tuple representing the cost of the four possible
48
        edits: inserts, deletes, substitutions, and shift, respectively (by
49
        default: (1, 1, 0.5, 0.5)) The substitution & shift costs should be
50
        significantly less than the cost of an insertion & deletion unless
51
        a log metric is used.
52
    :param str layout: name of the keyboard layout to use (Currently supported:
53
        QWERTY, Dvorak, AZERTY, QWERTZ)
54
    :returns: typo distance
55
    :rtype: float
56
57
    >>> typo('cat', 'hat')
58
    1.5811388
59
    >>> typo('Niall', 'Neil')
60
    2.8251407
61
    >>> typo('Colin', 'Cuilen')
62
    3.4142137
63
    >>> typo('ATCG', 'TAGC')
64
    2.5
65
66
    >>> typo('cat', 'hat', metric='manhattan')
67
    2.0
68
    >>> typo('Niall', 'Neil', metric='manhattan')
69
    3.0
70
    >>> typo('Colin', 'Cuilen', metric='manhattan')
71
    3.5
72
    >>> typo('ATCG', 'TAGC', metric='manhattan')
73
    2.5
74
75
    >>> typo('cat', 'hat', metric='log-manhattan')
76
    0.804719
77
    >>> typo('Niall', 'Neil', metric='log-manhattan')
78
    2.2424533
79
    >>> typo('Colin', 'Cuilen', metric='log-manhattan')
80
    2.2424533
81
    >>> typo('ATCG', 'TAGC', metric='log-manhattan')
82
    2.3465736
83
    """
84 1
    ins_cost, del_cost, sub_cost, shift_cost = cost
85
86 1
    if src == tar:
87 1
        return 0.0
88 1
    if not src:
89 1
        return len(tar) * ins_cost
90 1
    if not tar:
91 1
        return len(src) * del_cost
92
93
    # fmt: off
94 1
    kbs = {'QWERTY': (
95
        (('`', '1', '2', '3', '4', '5', '6', '7', '8', '9', '0', '-', '='),
96
         ('', 'q', 'w', 'e', 'r', 't', 'y', 'u', 'i', 'o', 'p', '[', ']',
97
          '\\'),
98
         ('', 'a', 's', 'd', 'f', 'g', 'h', 'j', 'k', 'l', ';', '\''),
99
         ('', 'z', 'x', 'c', 'v', 'b', 'n', 'm', ',', '.', '/')),
100
        (('~', '!', '@', '#', '$', '%', '^', '&', '*', '(', ')', '_', '+'),
101
         ('', 'Q', 'W', 'E', 'R', 'T', 'Y', 'U', 'I', 'O', 'P', '{', '}', '|'),
102
         ('', 'A', 'S', 'D', 'F', 'G', 'H', 'J', 'K', 'L', ':', '"'),
103
         ('', 'Z', 'X', 'C', 'V', 'B', 'N', 'M', '<', '>', '?'))
104
    ), 'Dvorak': (
105
        (('`', '1', '2', '3', '4', '5', '6', '7', '8', '9', '0', '[', ']'),
106
         ('', '\'', ',', '.', 'p', 'y', 'f', 'g', 'c', 'r', 'l', '/', '=',
107
          '\\'),
108
         ('', 'a', 'o', 'e', 'u', 'i', 'd', 'h', 't', 'n', 's', '-'),
109
         ('', ';', 'q', 'j', 'k', 'x', 'b', 'm', 'w', 'v', 'z')),
110
        (('~', '!', '@', '#', '$', '%', '^', '&', '*', '(', ')', '{', '}'),
111
         ('', '"', '<', '>', 'P', 'Y', 'F', 'G', 'C', 'R', 'L', '?', '+', '|'),
112
         ('', 'A', 'O', 'E', 'U', 'I', 'D', 'H', 'T', 'N', 'S', '_'),
113
         ('', ':', 'Q', 'J', 'K', 'X', 'B', 'M', 'W', 'V', 'Z'))
114
    ), 'AZERTY': (
115
        (('²', '&', 'é', '"', '\'', '(', '-', 'è', '_', 'ç', 'à', ')', '='),
116
         ('', 'a', 'z', 'e', 'r', 't', 'y', 'u', 'i', 'o', 'p', '', '$'),
117
         ('', 'q', 's', 'd', 'f', 'g', 'h', 'j', 'k', 'l', 'm', 'ù', '*'),
118
         ('<', 'w', 'x', 'c', 'v', 'b', 'n', ',', ';', ':', '!')),
119
        (('~', '1', '2', '3', '4', '5', '6', '7', '8', '9', '0', '°', '+'),
120
         ('', 'A', 'W', 'E', 'R', 'T', 'Y', 'U', 'I', 'O', 'P', '', '£'),
121
         ('', 'Q', 'S', 'D', 'F', 'G', 'H', 'J', 'K', 'L', 'M', 'Ù', 'μ'),
122
         ('>', 'W', 'X', 'C', 'V', 'B', 'N', '?', '.', '/', '§'))
123
    ), 'QWERTZ': (
124
        (('', '1', '2', '3', '4', '5', '6', '7', '8', '9', '0', 'ß', ''),
125
         ('', 'q', 'w', 'e', 'r', 't', 'z', 'u', 'i', 'o', 'p', ' ü', '+',
126
          '\\'),
127
         ('', 'a', 's', 'd', 'f', 'g', 'h', 'j', 'k', 'l', 'ö', 'ä', '#'),
128
         ('<', 'y', 'x', 'c', 'v', 'b', 'n', 'm', ',', '.', '-')),
129
        (('°', '!', '"', '§', '$', '%', '&', '/', '(', ')', '=', '?', ''),
130
         ('', 'Q', 'W', 'E', 'R', 'T', 'Z', 'U', 'I', 'O', 'P', 'Ü', '*', ''),
131
         ('', 'A', 'S', 'D', 'F', 'G', 'H', 'J', 'K', 'L', 'Ö', 'Ä', '\''),
132
         ('>', 'Y', 'X', 'C', 'V', 'B', 'N', 'M', ';', ':', '_'))
133
    )}
134
    # fmt: on
135
136 1
    keyboard = kbs[layout]
137 1
    lowercase = {item for sublist in keyboard[0] for item in sublist}
138 1
    uppercase = {item for sublist in keyboard[1] for item in sublist}
139
140 1
    def _kb_array_for_char(char):
141
        """Return the keyboard layout that contains ch."""
142 1
        if char in lowercase:
143 1
            return keyboard[0]
144 1
        elif char in uppercase:
145 1
            return keyboard[1]
146 1
        raise ValueError(char + ' not found in any keyboard layouts')
147
148 1
    def _get_char_coord(char, kb_array):
149
        """Return the row & column of char in the keyboard."""
150 1
        for row in kb_array:  # pragma: no branch
151 1
            if char in row:
152 1
                return kb_array.index(row), row.index(char)
153
154 1
    def _euclidean_keyboard_distance(char1, char2):
155 1
        row1, col1 = _get_char_coord(char1, _kb_array_for_char(char1))
156 1
        row2, col2 = _get_char_coord(char2, _kb_array_for_char(char2))
157 1
        return ((row1 - row2) ** 2 + (col1 - col2) ** 2) ** 0.5
158
159 1
    def _manhattan_keyboard_distance(char1, char2):
160 1
        row1, col1 = _get_char_coord(char1, _kb_array_for_char(char1))
161 1
        row2, col2 = _get_char_coord(char2, _kb_array_for_char(char2))
162 1
        return abs(row1 - row2) + abs(col1 - col2)
163
164 1
    def _log_euclidean_keyboard_distance(char1, char2):
165 1
        return log(1 + _euclidean_keyboard_distance(char1, char2))
166
167 1
    def _log_manhattan_keyboard_distance(char1, char2):
168 1
        return log(1 + _manhattan_keyboard_distance(char1, char2))
169
170 1
    metric_dict = {
171
        'euclidean': _euclidean_keyboard_distance,
172
        'manhattan': _manhattan_keyboard_distance,
173
        'log-euclidean': _log_euclidean_keyboard_distance,
174
        'log-manhattan': _log_manhattan_keyboard_distance,
175
    }
176
177 1
    def _substitution_cost(char1, char2):
178 1
        cost = sub_cost
179 1
        cost *= metric_dict[metric](char1, char2) + shift_cost * (
180
            _kb_array_for_char(char1) != _kb_array_for_char(char2)
181
        )
182 1
        return cost
183
184 1
    d_mat = np_zeros((len(src) + 1, len(tar) + 1), dtype=np_float32)
185 1
    for i in range(len(src) + 1):
186 1
        d_mat[i, 0] = i * del_cost
187 1
    for j in range(len(tar) + 1):
188 1
        d_mat[0, j] = j * ins_cost
189
190 1
    for i in range(len(src)):
0 ignored issues
show
unused-code introduced by
Consider using enumerate instead of iterating with range and len
Loading history...
191 1
        for j in range(len(tar)):
0 ignored issues
show
unused-code introduced by
Consider using enumerate instead of iterating with range and len
Loading history...
192 1
            d_mat[i + 1, j + 1] = min(
193
                d_mat[i + 1, j] + ins_cost,  # ins
194
                d_mat[i, j + 1] + del_cost,  # del
195
                d_mat[i, j]
196
                + (
197
                    _substitution_cost(src[i], tar[j])
198
                    if src[i] != tar[j]
199
                    else 0
200
                ),  # sub/==
201
            )
202
203 1
    return d_mat[len(src), len(tar)]
204
205
206 1
def dist_typo(src, tar, metric='euclidean', cost=(1, 1, 0.5, 0.5)):
207
    """Return the normalized typo distance between two strings.
208
209
    This is typo distance, normalized to [0, 1].
210
211
    :param str src: source string for comparison
212
    :param str tar: target string for comparison
213
    :param str metric: supported values include: 'euclidean', 'manhattan',
214
          'log-euclidean', and 'log-manhattan'
215
    :param tuple cost: a 4-tuple representing the cost of the four possible
216
        edits: inserts, deletes, substitutions, and shift, respectively (by
217
        default: (1, 1, 0.5, 0.5)) The substitution & shift costs should be
218
        significantly less than the cost of an insertion & deletion unless
219
        a log metric is used.
220
    :returns: normalized typo distance
221
    :rtype: float
222
223
    >>> round(dist_typo('cat', 'hat'), 12)
224
    0.527046283086
225
    >>> round(dist_typo('Niall', 'Neil'), 12)
226
    0.565028142929
227
    >>> round(dist_typo('Colin', 'Cuilen'), 12)
228
    0.569035609563
229
    >>> dist_typo('ATCG', 'TAGC')
230
    0.625
231
    """
232 1
    if src == tar:
233 1
        return 0
234 1
    ins_cost, del_cost = cost[:2]
235 1
    return typo(src, tar, metric, cost) / (
236
        max(len(src) * del_cost, len(tar) * ins_cost)
237
    )
238
239
240 1
def sim_typo(src, tar, metric='euclidean', cost=(1, 1, 0.5, 0.5)):
241
    """Return the normalized typo similarity between two strings.
242
243
    Normalized typo similarity is the complement of normalized typo distance:
244
    :math:`sim_{typo} = 1 - dist_{typo}`.
245
246
    :param str src: source string for comparison
247
    :param str tar: target string for comparison
248
    :param str metric: supported values include: 'euclidean', 'manhattan',
249
          'log-euclidean', and 'log-manhattan'
250
    :param tuple cost: a 4-tuple representing the cost of the four possible
251
        edits: inserts, deletes, substitutions, and shift, respectively (by
252
        default: (1, 1, 0.5, 0.5)) The substitution & shift costs should be
253
        significantly less than the cost of an insertion & deletion unless
254
        a log metric is used.
255
    :returns: normalized typo similarity
256
    :rtype: float
257
258
    >>> round(sim_typo('cat', 'hat'), 12)
259
    0.472953716914
260
    >>> round(sim_typo('Niall', 'Neil'), 12)
261
    0.434971857071
262
    >>> round(sim_typo('Colin', 'Cuilen'), 12)
263
    0.430964390437
264
    >>> sim_typo('ATCG', 'TAGC')
265
    0.375
266
    """
267 1
    return 1 - dist_typo(src, tar, metric, cost)
268
269
270
if __name__ == '__main__':
271
    import doctest
272
273
    doctest.testmod()
274