abydos.distance._typo.Typo.dist_abs() - Code Metrics - chrislit/abydos - Measure and Improve Code Quality continuously with Scrutinizer

abydos.distance._typo.Typo.dist_abs() F
last analyzed 2020-12-31 20:10 UTC

↳ Parent: abydos.distance._typo

Complexity

Conditions

Size

Total Lines	188
Code Lines	71

Duplication

Lines	0
Ratio	0 %

Code Coverage

Tests	58
CRAP Score	21

Importance

Changes

Metric	Value
eloc	71
dl	0
loc	188
ccs	58
cts	58
cp	1
rs	0
c	0
b	0
f	0
cc	21
nop	3
crap	21

How to fix Long Method Complexity

# Copyright 2018-2020 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.

"""abydos.distance._typo.

Typo edit distance functions.
"""

from itertools import chain
from math import log
from typing import Any, Dict, Tuple, cast

from numpy import float_ as np_float
from numpy import zeros as np_zeros

from ._distance import _Distance


__all__ = ['Typo']


class Typo(_Distance):
    """Typo distance.

    This is inspired by Typo-Distance :cite:`Song:2011`, and a fair bit of
    this was copied from that module. Compared to the original, this supports
    different metrics for substitution.

    .. versionadded:: 0.3.6
    """

    # fmt: off
    _keyboard = {'QWERTY': (
        (('`', '1', '2', '3', '4', '5', '6', '7', '8', '9', '0', '-', '='),
         ('', 'q', 'w', 'e', 'r', 't', 'y', 'u', 'i', 'o', 'p', '[', ']',
          '\\'),
         ('', 'a', 's', 'd', 'f', 'g', 'h', 'j', 'k', 'l', ';', "'"),
         ('', 'z', 'x', 'c', 'v', 'b', 'n', 'm', ',', '.', '/'),
         ('', '', '', ' ')),
        (('~', '!', '@', '#', '$', '%', '^', '&', '*', '(', ')', '_', '+'),
         ('', 'Q', 'W', 'E', 'R', 'T', 'Y', 'U', 'I', 'O', 'P', '{', '}', '|'),
         ('', 'A', 'S', 'D', 'F', 'G', 'H', 'J', 'K', 'L', ':', '"'),
         ('', 'Z', 'X', 'C', 'V', 'B', 'N', 'M', '<', '>', '?'),
         ('', '', '', ' '))
    ), 'Dvorak': (
        (('`', '1', '2', '3', '4', '5', '6', '7', '8', '9', '0', '[', ']'),
         ('', "'", ',', '.', 'p', 'y', 'f', 'g', 'c', 'r', 'l', '/', '=',
          '\\'),
         ('', 'a', 'o', 'e', 'u', 'i', 'd', 'h', 't', 'n', 's', '-'),
         ('', ';', 'q', 'j', 'k', 'x', 'b', 'm', 'w', 'v', 'z'),
         ('', '', '', ' ')),
        (('~', '!', '@', '#', '$', '%', '^', '&', '*', '(', ')', '{', '}'),
         ('', '"', '<', '>', 'P', 'Y', 'F', 'G', 'C', 'R', 'L', '?', '+', '|'),
         ('', 'A', 'O', 'E', 'U', 'I', 'D', 'H', 'T', 'N', 'S', '_'),
         ('', ':', 'Q', 'J', 'K', 'X', 'B', 'M', 'W', 'V', 'Z'),
         ('', '', '', ' '))
    ), 'AZERTY': (
        (('²', '&', 'é', '"', "'", '(', '-', 'è', '_', 'ç', 'à', ')', '='),
         ('', 'a', 'z', 'e', 'r', 't', 'y', 'u', 'i', 'o', 'p', '', '$'),
         ('', 'q', 's', 'd', 'f', 'g', 'h', 'j', 'k', 'l', 'm', 'ù', '*'),
         ('<', 'w', 'x', 'c', 'v', 'b', 'n', ',', ';', ':', '!'),
         ('', '', '', ' ')),
        (('~', '1', '2', '3', '4', '5', '6', '7', '8', '9', '0', '°', '+'),
         ('', 'A', 'W', 'E', 'R', 'T', 'Y', 'U', 'I', 'O', 'P', '', '£'),
         ('', 'Q', 'S', 'D', 'F', 'G', 'H', 'J', 'K', 'L', 'M', 'Ù', 'μ'),
         ('>', 'W', 'X', 'C', 'V', 'B', 'N', '?', '.', '/', '§'),
         ('', '', '', ' '))
    ), 'QWERTZ': (
        (('', '1', '2', '3', '4', '5', '6', '7', '8', '9', '0', 'ß', ''),
         ('', 'q', 'w', 'e', 'r', 't', 'z', 'u', 'i', 'o', 'p', ' ü', '+',
          '\\'),
         ('', 'a', 's', 'd', 'f', 'g', 'h', 'j', 'k', 'l', 'ö', 'ä', '#'),
         ('<', 'y', 'x', 'c', 'v', 'b', 'n', 'm', ',', '.', '-'),
         ('', '', '', ' ')),
        (('°', '!', '"', '§', '$', '%', '&', '/', '(', ')', '=', '?', ''),
         ('', 'Q', 'W', 'E', 'R', 'T', 'Z', 'U', 'I', 'O', 'P', 'Ü', '*', ''),
         ('', 'A', 'S', 'D', 'F', 'G', 'H', 'J', 'K', 'L', 'Ö', 'Ä', "'"),
         ('>', 'Y', 'X', 'C', 'V', 'B', 'N', 'M', ';', ':', '_'),
         ('', '', '', ' '))
    ), 'QWERTZ_HUN': (
        (('0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'ö', 'ü', 'ó'),
         ('', 'q', 'w', 'e', 'r', 't', 'z', 'u', 'i', 'o', 'p', ' ő', 'ú',
          '\\'),
         ('', 'a', 's', 'd', 'f', 'g', 'h', 'j', 'k', 'l', 'é', 'á', 'ű'),
         ('í', 'y', 'x', 'c', 'v', 'b', 'n', 'm', ',', '.', '-'),
         ('', '', '', ' ')),
        (('§', "'", '"', '+', '!', '%', '/', '=', '(', ')', 'Ö', 'Ü', 'Ó'),
         ('', 'Q', 'W', 'E', 'R', 'T', 'Z', 'U', 'I', 'O', 'P', 'Ő', 'Ú', ''),
         ('', 'A', 'S', 'D', 'F', 'G', 'H', 'J', 'K', 'L', 'É', 'Á', 'Ű'),
         ('Í', 'Y', 'X', 'C', 'V', 'B', 'N', 'M', '?', ':', '_'),
         ('', '', '', ' ')),
        (('', '~', 'ˇ', '^', '˘', '°', '˛', '`', '˙', '´', '˝', '¨', '¸'),
         ('', '\\', '|', '', '', '', '', '€', '', '', '', '÷', '×', ''),
         ('', '', 'đ', 'Đ', '[', ']', '', '', 'ł', 'Ł', '$', 'ß', '¤'),
         ('<', '>', '#', '&', '@', '{', '}', '', ';', '>', '*'),
         ('', '', '', ' '))
    )}  # type: Dict[str, Tuple[Tuple[Tuple[str, ...], ...], ...]]
    # fmt: on

    def __init__(
        self,
        metric: str = 'euclidean',
        cost: Tuple[float, float, float, float] = (1.0, 1.0, 0.5, 0.5),
        layout: str = 'QWERTY',
        failsafe: bool = False,
        **kwargs: Any
    ):
        """Initialize Typo instance.

        Parameters
        ----------
        metric : str
            Supported values include: ``euclidean``, ``manhattan``,
            ``log-euclidean``, and ``log-manhattan``
        cost : tuple
            A 4-tuple representing the cost of the four possible edits:
            inserts, deletes, substitutions, and shift, respectively (by
            default: (1, 1, 0.5, 0.5)) The substitution & shift costs should be
            significantly less than the cost of an insertion & deletion unless
            a log metric is used.
        layout : str
            Name of the keyboard layout to use (Currently supported:
            ``QWERTY``, ``Dvorak``, ``AZERTY``, ``QWERTZ``, ``auto``). If
            ``auto`` is selected, the class will attempt to determine an
            appropriate keyboard based on the supplied words.
        failsafe : bool
            If True, substitution of an unknown character (one not present on
            the selected keyboard) will incur a cost equal to an insertion plus
            a deletion.
        **kwargs
            Arbitrary keyword arguments


        .. versionadded:: 0.4.0

        """
        super(Typo, self).__init__(**kwargs)
        self._metric = metric
        self._cost = cost
        self._layout = layout
        self._failsafe = failsafe

    def dist_abs(self, src: str, tar: str) -> float:
        """Return the typo distance between two strings.

        Parameters
        ----------
        src : str
            Source string for comparison
        tar : str
            Target string for comparison

        Returns
        -------
        float
            Typo distance

        Raises
        ------
        ValueError
            char not found in any keyboard layouts

        Examples
        --------
        >>> cmp = Typo()
        >>> cmp.dist_abs('cat', 'hat')
        1.5811388300841898
        >>> cmp.dist_abs('Niall', 'Neil')
        2.8251407699364424
        >>> cmp.dist_abs('Colin', 'Cuilen')
        3.414213562373095
        >>> cmp.dist_abs('ATCG', 'TAGC')
        2.5

        >>> cmp = Typo(metric='manhattan')
        >>> cmp.dist_abs('cat', 'hat')
        2.0
        >>> cmp.dist_abs('Niall', 'Neil')
        3.0
        >>> cmp.dist_abs('Colin', 'Cuilen')
        3.5
        >>> cmp.dist_abs('ATCG', 'TAGC')
        2.5

        >>> cmp = Typo(metric='log-manhattan')
        >>> cmp.dist_abs('cat', 'hat')
        0.8047189562170501
        >>> cmp.dist_abs('Niall', 'Neil')
        2.2424533248940004
        >>> cmp.dist_abs('Colin', 'Cuilen')
        2.242453324894
        >>> cmp.dist_abs('ATCG', 'TAGC')
        2.3465735902799727


        .. versionadded:: 0.3.0
        .. versionchanged:: 0.3.6
            Encapsulated in class

        """
        ins_cost, del_cost, sub_cost, shift_cost = self._cost

        if src == tar:
            return 0.0
        if not src:
            return len(tar) * ins_cost
        if not tar:
            return len(src) * del_cost

        if self._layout == 'auto':
            for kb in ['QWERTY', 'QWERTZ', 'AZERTY']:
                keys = set(chain(*chain(*self._keyboard[kb])))
                letters = set(src) | set(tar)
                if not (letters - keys):
                    keyboard = self._keyboard[kb]
                    break
            else:
                # Fallback to QWERTY
                keyboard = self._keyboard['QWERTY']
        else:
            keyboard = self._keyboard[self._layout]

        kb_array = []
        for kb_mode in keyboard:
            kb_array.append({item for sublist in kb_mode for item in sublist})
        keys = set(chain(*chain(*keyboard)))

        def _kb_array_for_char(char: str) -> Tuple[Tuple[str, ...], ...]:
            """Return the keyboard layout that contains ch.

            Parameters
            ----------
            char : str
                The character to lookup

            Returns
            -------
            tuple
                A keyboard

            Raises
            ------
            ValueError
                char not found in any keyboard layouts

            .. versionadded:: 0.3.0

            """
            for i, kb_mode in enumerate(kb_array):
                if char in kb_mode:
                    return keyboard[i]
            raise ValueError(char + ' not found in any keyboard layouts')

        def _substitution_cost(char1: str, char2: str) -> float:
            if self._failsafe and (char1 not in keys or char2 not in keys):
                return ins_cost + del_cost
            cost = sub_cost
            cost *= metric_dict[self._metric](char1, char2) + shift_cost * (
                _kb_array_for_char(char1) != _kb_array_for_char(char2)
            )
            return cost

        def _get_char_coord(
            char: str, kb_array: Tuple[Tuple[str, ...], ...]
        ) -> Tuple[int, int]:
            """Return the row & column of char in the keyboard.

            Parameters
            ----------
            char : str
                The character to search for
            kb_array : tuple of tuples
                The array of key positions

            Returns
            -------
            tuple
                The row & column of the key

            .. versionadded:: 0.3.0

            """
            for row in kb_array:  # pragma: no branch
                if char in row:
                    break
            return kb_array.index(row), row.index(char)


        def _euclidean_keyboard_distance(char1: str, char2: str) -> float:
            row1, col1 = _get_char_coord(char1, _kb_array_for_char(char1))
            row2, col2 = _get_char_coord(char2, _kb_array_for_char(char2))
            return ((row1 - row2) ** 2 + (col1 - col2) ** 2) ** 0.5

        def _manhattan_keyboard_distance(char1: str, char2: str) -> float:
            row1, col1 = _get_char_coord(char1, _kb_array_for_char(char1))
            row2, col2 = _get_char_coord(char2, _kb_array_for_char(char2))
            return abs(row1 - row2) + abs(col1 - col2)

        def _log_euclidean_keyboard_distance(char1: str, char2: str) -> float:
            return log(1 + _euclidean_keyboard_distance(char1, char2))

        def _log_manhattan_keyboard_distance(char1: str, char2: str) -> float:
            return log(1 + _manhattan_keyboard_distance(char1, char2))

        metric_dict = {
            'euclidean': _euclidean_keyboard_distance,
            'manhattan': _manhattan_keyboard_distance,
            'log-euclidean': _log_euclidean_keyboard_distance,
            'log-manhattan': _log_manhattan_keyboard_distance,
        }

        d_mat = np_zeros((len(src) + 1, len(tar) + 1), dtype=np_float)
        for i in range(len(src) + 1):
            d_mat[i, 0] = i * del_cost
        for j in range(len(tar) + 1):
            d_mat[0, j] = j * ins_cost

        for i in range(len(src)):
            for j in range(len(tar)):
                d_mat[i + 1, j + 1] = min(
                    d_mat[i + 1, j] + ins_cost,  # ins
                    d_mat[i, j + 1] + del_cost,  # del
                    d_mat[i, j]
                    + (
                        _substitution_cost(src[i], tar[j])
                        if src[i] != tar[j]
                        else 0
                    ),  # sub/==
                )

        return cast(float, d_mat[len(src), len(tar)])

    def dist(self, src: str, tar: str) -> float:
        """Return the normalized typo distance between two strings.

        This is typo distance, normalized to [0, 1].

        Parameters
        ----------
        src : str
            Source string for comparison
        tar : str
            Target string for comparison

        Returns
        -------
        float
            Normalized typo distance

        Examples
        --------
        >>> cmp = Typo()
        >>> round(cmp.dist('cat', 'hat'), 12)
        0.527046276695
        >>> round(cmp.dist('Niall', 'Neil'), 12)
        0.565028153987
        >>> round(cmp.dist('Colin', 'Cuilen'), 12)
        0.569035593729
        >>> cmp.dist('ATCG', 'TAGC')
        0.625


        .. versionadded:: 0.3.0
        .. versionchanged:: 0.3.6
            Encapsulated in class

        """
        if src == tar:
            return 0.0
        ins_cost, del_cost = self._cost[:2]
        return self.dist_abs(src, tar) / (
            max(len(src) * del_cost, len(tar) * ins_cost)
        )


if __name__ == '__main__':
    import doctest

    doctest.testmod()


1		# Copyright 2018-2020 by Christopher C. Little.
2		# This file is part of Abydos.
3		#
4		# Abydos is free software: you can redistribute it and/or modify
5		# it under the terms of the GNU General Public License as published by
6		# the Free Software Foundation, either version 3 of the License, or
7		# (at your option) any later version.
8		#
9		# Abydos is distributed in the hope that it will be useful,
10		# but WITHOUT ANY WARRANTY; without even the implied warranty of
11		# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12		# GNU General Public License for more details.
13		#
14		# You should have received a copy of the GNU General Public License
15		# along with Abydos. If not, see <http://www.gnu.org/licenses/>.
16
17		"""abydos.distance._typo.
18
19	1	Typo edit distance functions.
20		"""
21
22		from itertools import chain
23		from math import log
24	1	from typing import Any, Dict, Tuple, cast
25
26		from numpy import float_ as np_float
27		from numpy import zeros as np_zeros
28
29		from ._distance import _Distance
30
31	1
32	1	__all__ = ['Typo']
33
34	1
35		class Typo(_Distance):
36	1	"""Typo distance.
37	1
38		This is inspired by Typo-Distance :cite:`Song:2011`, and a fair bit of
39	1	this was copied from that module. Compared to the original, this supports
40		different metrics for substitution.
41	1
42	1	.. versionadded:: 0.3.6
43		"""
44
45	1	# fmt: off
46		_keyboard = {'QWERTY': (
47		(('`', '1', '2', '3', '4', '5', '6', '7', '8', '9', '0', '-', '='),
48	1	('', 'q', 'w', 'e', 'r', 't', 'y', 'u', 'i', 'o', 'p', '[', ']',
49		'\\'),
50		('', 'a', 's', 'd', 'f', 'g', 'h', 'j', 'k', 'l', ';', "'"),
51		('', 'z', 'x', 'c', 'v', 'b', 'n', 'm', ',', '.', '/'),
52		('', '', '', ' ')),
53		(('~', '!', '@', '#', '$', '%', '^', '&', '*', '(', ')', '_', '+'),
54		('', 'Q', 'W', 'E', 'R', 'T', 'Y', 'U', 'I', 'O', 'P', '{', '}', '\|'),
55		('', 'A', 'S', 'D', 'F', 'G', 'H', 'J', 'K', 'L', ':', '"'),
56		('', 'Z', 'X', 'C', 'V', 'B', 'N', 'M', '<', '>', '?'),
57		('', '', '', ' '))
58		), 'Dvorak': (
59	1	(('`', '1', '2', '3', '4', '5', '6', '7', '8', '9', '0', '[', ']'),
60		('', "'", ',', '.', 'p', 'y', 'f', 'g', 'c', 'r', 'l', '/', '=',
61		'\\'),
62		('', 'a', 'o', 'e', 'u', 'i', 'd', 'h', 't', 'n', 's', '-'),
63		('', ';', 'q', 'j', 'k', 'x', 'b', 'm', 'w', 'v', 'z'),
64		('', '', '', ' ')),
65		(('~', '!', '@', '#', '$', '%', '^', '&', '*', '(', ')', '{', '}'),
66		('', '"', '<', '>', 'P', 'Y', 'F', 'G', 'C', 'R', 'L', '?', '+', '\|'),
67		('', 'A', 'O', 'E', 'U', 'I', 'D', 'H', 'T', 'N', 'S', '_'),
68		('', ':', 'Q', 'J', 'K', 'X', 'B', 'M', 'W', 'V', 'Z'),
69		('', '', '', ' '))
70		), 'AZERTY': (
71		(('²', '&', 'é', '"', "'", '(', '-', 'è', '_', 'ç', 'à', ')', '='),
72		('', 'a', 'z', 'e', 'r', 't', 'y', 'u', 'i', 'o', 'p', '', '$'),
73		('', 'q', 's', 'd', 'f', 'g', 'h', 'j', 'k', 'l', 'm', 'ù', '*'),
74		('<', 'w', 'x', 'c', 'v', 'b', 'n', ',', ';', ':', '!'),
75		('', '', '', ' ')),
76		(('~', '1', '2', '3', '4', '5', '6', '7', '8', '9', '0', '°', '+'),
77		('', 'A', 'W', 'E', 'R', 'T', 'Y', 'U', 'I', 'O', 'P', '', '£'),
78		('', 'Q', 'S', 'D', 'F', 'G', 'H', 'J', 'K', 'L', 'M', 'Ù', 'μ'),
79		('>', 'W', 'X', 'C', 'V', 'B', 'N', '?', '.', '/', '§'),
80		('', '', '', ' '))
81		), 'QWERTZ': (
82		(('', '1', '2', '3', '4', '5', '6', '7', '8', '9', '0', 'ß', ''),
83		('', 'q', 'w', 'e', 'r', 't', 'z', 'u', 'i', 'o', 'p', ' ü', '+',
84		'\\'),
85		('', 'a', 's', 'd', 'f', 'g', 'h', 'j', 'k', 'l', 'ö', 'ä', '#'),
86		('<', 'y', 'x', 'c', 'v', 'b', 'n', 'm', ',', '.', '-'),
87		('', '', '', ' ')),
88		(('°', '!', '"', '§', '$', '%', '&', '/', '(', ')', '=', '?', ''),
89		('', 'Q', 'W', 'E', 'R', 'T', 'Z', 'U', 'I', 'O', 'P', 'Ü', '*', ''),
90		('', 'A', 'S', 'D', 'F', 'G', 'H', 'J', 'K', 'L', 'Ö', 'Ä', "'"),
91		('>', 'Y', 'X', 'C', 'V', 'B', 'N', 'M', ';', ':', '_'),
92		('', '', '', ' '))
93		), 'QWERTZ_HUN': (
94		(('0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'ö', 'ü', 'ó'),
95		('', 'q', 'w', 'e', 'r', 't', 'z', 'u', 'i', 'o', 'p', ' ő', 'ú',
96		'\\'),
97		('', 'a', 's', 'd', 'f', 'g', 'h', 'j', 'k', 'l', 'é', 'á', 'ű'),
98		('í', 'y', 'x', 'c', 'v', 'b', 'n', 'm', ',', '.', '-'),
99		('', '', '', ' ')),
100		(('§', "'", '"', '+', '!', '%', '/', '=', '(', ')', 'Ö', 'Ü', 'Ó'),
101		('', 'Q', 'W', 'E', 'R', 'T', 'Z', 'U', 'I', 'O', 'P', 'Ő', 'Ú', ''),
102		('', 'A', 'S', 'D', 'F', 'G', 'H', 'J', 'K', 'L', 'É', 'Á', 'Ű'),
103		('Í', 'Y', 'X', 'C', 'V', 'B', 'N', 'M', '?', ':', '_'),
104		('', '', '', ' ')),
105		(('', '~', 'ˇ', '^', '˘', '°', '˛', '`', '˙', '´', '˝', '¨', '¸'),
106		('', '\\', '\|', '', '', '', '', '€', '', '', '', '÷', '×', ''),
107		('', '', 'đ', 'Đ', '[', ']', '', '', 'ł', 'Ł', '$', 'ß', '¤'),
108		('<', '>', '#', '&', '@', '{', '}', '', ';', '>', '*'),
109	1	('', '', '', ' '))
110		)} # type: Dict[str, Tuple[Tuple[Tuple[str, ...], ...], ...]]
111		# fmt: on
112
113		def __init__(
114		self,
115		metric: str = 'euclidean',
116		cost: Tuple[float, float, float, float] = (1.0, 1.0, 0.5, 0.5),
117		layout: str = 'QWERTY',
118		failsafe: bool = False,
119		**kwargs: Any
120		):
121		"""Initialize Typo instance.
122
123		Parameters
124		----------
125		metric : str
126		Supported values include: ``euclidean``, ``manhattan``,
127		``log-euclidean``, and ``log-manhattan``
128		cost : tuple
129		A 4-tuple representing the cost of the four possible edits:
130		inserts, deletes, substitutions, and shift, respectively (by
131		default: (1, 1, 0.5, 0.5)) The substitution & shift costs should be
132		significantly less than the cost of an insertion & deletion unless
133		a log metric is used.
134		layout : str
135		Name of the keyboard layout to use (Currently supported:
136		``QWERTY``, ``Dvorak``, ``AZERTY``, ``QWERTZ``, ``auto``). If
137		``auto`` is selected, the class will attempt to determine an
138		appropriate keyboard based on the supplied words.
139		failsafe : bool
140		If True, substitution of an unknown character (one not present on
141		the selected keyboard) will incur a cost equal to an insertion plus
142		a deletion.
143		**kwargs
144		Arbitrary keyword arguments
145
146	1
147	1	.. versionadded:: 0.4.0
148	1
149	1	"""
150	1	super(Typo, self).__init__(**kwargs)
151		self._metric = metric
152	1	self._cost = cost
153		self._layout = layout
154		self._failsafe = failsafe
155
156		def dist_abs(self, src: str, tar: str) -> float:
157		"""Return the typo distance between two strings.
158
159		Parameters
160		----------
161		src : str
162		Source string for comparison
163		tar : str
164		Target string for comparison
165
166		Returns
167		-------
168		float
169		Typo distance
170
171		Raises
172		------
173		ValueError
174		char not found in any keyboard layouts
175
176		Examples
177		--------
178		>>> cmp = Typo()
179		>>> cmp.dist_abs('cat', 'hat')
180		1.5811388300841898
181		>>> cmp.dist_abs('Niall', 'Neil')
182		2.8251407699364424
183		>>> cmp.dist_abs('Colin', 'Cuilen')
184		3.414213562373095
185		>>> cmp.dist_abs('ATCG', 'TAGC')
186		2.5
187
188		>>> cmp = Typo(metric='manhattan')
189		>>> cmp.dist_abs('cat', 'hat')
190		2.0
191		>>> cmp.dist_abs('Niall', 'Neil')
192		3.0
193		>>> cmp.dist_abs('Colin', 'Cuilen')
194		3.5
195		>>> cmp.dist_abs('ATCG', 'TAGC')
196		2.5
197
198		>>> cmp = Typo(metric='log-manhattan')
199		>>> cmp.dist_abs('cat', 'hat')
200		0.8047189562170501
201		>>> cmp.dist_abs('Niall', 'Neil')
202		2.2424533248940004
203		>>> cmp.dist_abs('Colin', 'Cuilen')
204		2.242453324894
205		>>> cmp.dist_abs('ATCG', 'TAGC')
206		2.3465735902799727
207
208
209		.. versionadded:: 0.3.0
210	1	.. versionchanged:: 0.3.6
211		Encapsulated in class
212	1
213	1	"""
214	1	ins_cost, del_cost, sub_cost, shift_cost = self._cost
215	1
216	1	if src == tar:
217	1	return 0.0
218		if not src:
219	1	return len(tar) * ins_cost
220	1	if not tar:
221	1	return len(src) * del_cost
222	1
223	1	if self._layout == 'auto':
224	1	for kb in ['QWERTY', 'QWERTZ', 'AZERTY']:
225	1	keys = set(chain(chain(self._keyboard[kb])))
226		letters = set(src) \| set(tar)
227		if not (letters - keys):
228	1	keyboard = self._keyboard[kb]
229		break
230	1	else:
231		# Fallback to QWERTY
232	1	keyboard = self._keyboard['QWERTY']
233	1	else:
234	1	keyboard = self._keyboard[self._layout]
235
236	1	kb_array = []
237		for kb_mode in keyboard:
238		kb_array.append({item for sublist in kb_mode for item in sublist})
239		keys = set(chain(chain(keyboard)))
240
241		def _kb_array_for_char(char: str) -> Tuple[Tuple[str, ...], ...]:
242		"""Return the keyboard layout that contains ch.
243
244		Parameters
245		----------
246		char : str
247		The character to lookup
248
249		Returns
250		-------
251		tuple
252		A keyboard
253
254		Raises
255		------
256		ValueError
257	1	char not found in any keyboard layouts
258	1
259	1	.. versionadded:: 0.3.0
260	1
261	1	"""
262		for i, kb_mode in enumerate(kb_array):
263	1	if char in kb_mode:
264	1	return keyboard[i]
265	1	raise ValueError(char + ' not found in any keyboard layouts')
266	1
267	1	def _substitution_cost(char1: str, char2: str) -> float:
268		if self._failsafe and (char1 not in keys or char2 not in keys):
269		return ins_cost + del_cost
270	1	cost = sub_cost
271		cost = metric_dict[self._metric](char1, char2) + shift_cost (
272	1	_kb_array_for_char(char1) != _kb_array_for_char(char2)
273		)
274		return cost
275
276		def _get_char_coord(
277		char: str, kb_array: Tuple[Tuple[str, ...], ...]
278		) -> Tuple[int, int]:
279		"""Return the row & column of char in the keyboard.
280
281		Parameters
282		----------
283		char : str
284		The character to search for
285		kb_array : tuple of tuples
286		The array of key positions
287
288		Returns
289		-------
290	1	tuple
291	1	The row & column of the key
292	1
293		.. versionadded:: 0.3.0
294	1
295	1	"""
296	1	for row in kb_array: # pragma: no branch
297	1	if char in row:
298		break
299	1	return kb_array.index(row), row.index(char)
		0 ignored issues – show introduced 2020-02-09 23:37 UTC by Report Bug Copy Issue Report The variable `row` does not seem to be defined in case the `for` loop on line `296` is not entered. Are you sure this can never be the case? Loading history...
300	1
301	1	def _euclidean_keyboard_distance(char1: str, char2: str) -> float:
302	1	row1, col1 = _get_char_coord(char1, _kb_array_for_char(char1))
303		row2, col2 = _get_char_coord(char2, _kb_array_for_char(char2))
304	1	return ((row1 - row2) 2 + (col1 - col2) 2) ** 0.5
305	1
306		def _manhattan_keyboard_distance(char1: str, char2: str) -> float:
307	1	row1, col1 = _get_char_coord(char1, _kb_array_for_char(char1))
308	1	row2, col2 = _get_char_coord(char2, _kb_array_for_char(char2))
309		return abs(row1 - row2) + abs(col1 - col2)
310	1
311		def _log_euclidean_keyboard_distance(char1: str, char2: str) -> float:
312		return log(1 + _euclidean_keyboard_distance(char1, char2))
313
314		def _log_manhattan_keyboard_distance(char1: str, char2: str) -> float:
315		return log(1 + _manhattan_keyboard_distance(char1, char2))
316
317	1	metric_dict = {
318	1	'euclidean': _euclidean_keyboard_distance,
319	1	'manhattan': _manhattan_keyboard_distance,
320	1	'log-euclidean': _log_euclidean_keyboard_distance,
321	1	'log-manhattan': _log_manhattan_keyboard_distance,
322		}
323	1
324	1	d_mat = np_zeros((len(src) + 1, len(tar) + 1), dtype=np_float)
325	1	for i in range(len(src) + 1):
326		d_mat[i, 0] = i * del_cost
327		for j in range(len(tar) + 1):
328		d_mat[0, j] = j * ins_cost
329
330		for i in range(len(src)):
331		for j in range(len(tar)):
332		d_mat[i + 1, j + 1] = min(
333		d_mat[i + 1, j] + ins_cost, # ins
334		d_mat[i, j + 1] + del_cost, # del
335		d_mat[i, j]
336	1	+ (
337		_substitution_cost(src[i], tar[j])
338	1	if src[i] != tar[j]
339		else 0
340		), # sub/==
341		)
342
343		return cast(float, d_mat[len(src), len(tar)])
344
345		def dist(self, src: str, tar: str) -> float:
346		"""Return the normalized typo distance between two strings.
347
348		This is typo distance, normalized to [0, 1].
349
350		Parameters
351		----------
352		src : str
353		Source string for comparison
354		tar : str
355		Target string for comparison
356
357		Returns
358		-------
359		float
360		Normalized typo distance
361
362		Examples
363		--------
364		>>> cmp = Typo()
365		>>> round(cmp.dist('cat', 'hat'), 12)
366		0.527046276695
367		>>> round(cmp.dist('Niall', 'Neil'), 12)
368		0.565028153987
369		>>> round(cmp.dist('Colin', 'Cuilen'), 12)
370		0.569035593729
371		>>> cmp.dist('ATCG', 'TAGC')
372		0.625
373	1
374	1
375	1	.. versionadded:: 0.3.0
376	1	.. versionchanged:: 0.3.6
377		Encapsulated in class
378
379		"""
380		if src == tar:
381	1	return 0.0
382		ins_cost, del_cost = self._cost[:2]
383		return self.dist_abs(src, tar) / (
384		max(len(src) * del_cost, len(tar) * ins_cost)
385		)
386
387	1
388		if __name__ == '__main__':
389		import doctest
390
391		doctest.testmod()
392

chrislit / abydos

abydos.distance._typo.Typo.dist_abs() F last analyzed 2020-12-31 20:10 UTC

Complexity

Size

Duplication

Code Coverage

Importance

How to fix Long Method Complexity

Long Method

Complexity

Duplication Side-by-Side

Filter issues like

abydos.distance._typo.Typo.dist_abs() F
last analyzed 2020-12-31 20:10 UTC