abydos.distance._typo.Typo.dist_abs() - Code Metrics - Inspection of "started new entry in HISTORY for 0.4.0" - chrislit/abydos - Measure and Improve Code Quality continuously with Scrutinizer

Completed

Push — master ( 3ac297...afe14d )

by Chris

created 2019-06-01 01:11 UTC

abydos.distance._typo.Typo.dist_abs() D

↳ Parent: abydos.distance._typo

Complexity

Conditions

Size

Total Lines	178
Code Lines	64

Duplication

Lines	0
Ratio	0 %

Code Coverage

Tests	47
CRAP Score	13

Importance

Changes

Metric	Value
cc	13
eloc	64
nop	6
dl	0
loc	178
ccs	47
cts	47
cp	1
crap	13
rs	4.1181
c	0
b	0
f	0

How to fix Long Method Complexity

# -*- coding: utf-8 -*-

# Copyright 2018 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.

"""abydos.distance._typo.

Typo edit distance functions.
"""

from __future__ import (
    absolute_import,
    division,
    print_function,
    unicode_literals,
)

from math import log

from numpy import float32 as np_float32
from numpy import zeros as np_zeros

from six.moves import range

from ._distance import _Distance

__all__ = ['Typo', 'dist_typo', 'sim_typo', 'typo']


class Typo(_Distance):
    """Typo distance.

    This is inspired by Typo-Distance :cite:`Song:2011`, and a fair bit of
    this was copied from that module. Compared to the original, this supports
    different metrics for substitution.
    """

    # fmt: off
    _keyboard = {'QWERTY': (
        (('`', '1', '2', '3', '4', '5', '6', '7', '8', '9', '0', '-', '='),
         ('', 'q', 'w', 'e', 'r', 't', 'y', 'u', 'i', 'o', 'p', '[', ']',
          '\\'),
         ('', 'a', 's', 'd', 'f', 'g', 'h', 'j', 'k', 'l', ';', '\''),
         ('', 'z', 'x', 'c', 'v', 'b', 'n', 'm', ',', '.', '/')),
        (('~', '!', '@', '#', '$', '%', '^', '&', '*', '(', ')', '_', '+'),
         ('', 'Q', 'W', 'E', 'R', 'T', 'Y', 'U', 'I', 'O', 'P', '{', '}', '|'),
         ('', 'A', 'S', 'D', 'F', 'G', 'H', 'J', 'K', 'L', ':', '"'),
         ('', 'Z', 'X', 'C', 'V', 'B', 'N', 'M', '<', '>', '?'))
    ), 'Dvorak': (
        (('`', '1', '2', '3', '4', '5', '6', '7', '8', '9', '0', '[', ']'),
         ('', '\'', ',', '.', 'p', 'y', 'f', 'g', 'c', 'r', 'l', '/', '=',
          '\\'),
         ('', 'a', 'o', 'e', 'u', 'i', 'd', 'h', 't', 'n', 's', '-'),
         ('', ';', 'q', 'j', 'k', 'x', 'b', 'm', 'w', 'v', 'z')),
        (('~', '!', '@', '#', '$', '%', '^', '&', '*', '(', ')', '{', '}'),
         ('', '"', '<', '>', 'P', 'Y', 'F', 'G', 'C', 'R', 'L', '?', '+', '|'),
         ('', 'A', 'O', 'E', 'U', 'I', 'D', 'H', 'T', 'N', 'S', '_'),
         ('', ':', 'Q', 'J', 'K', 'X', 'B', 'M', 'W', 'V', 'Z'))
    ), 'AZERTY': (
        (('²', '&', 'é', '"', '\'', '(', '-', 'è', '_', 'ç', 'à', ')', '='),
         ('', 'a', 'z', 'e', 'r', 't', 'y', 'u', 'i', 'o', 'p', '', '$'),
         ('', 'q', 's', 'd', 'f', 'g', 'h', 'j', 'k', 'l', 'm', 'ù', '*'),
         ('<', 'w', 'x', 'c', 'v', 'b', 'n', ',', ';', ':', '!')),
        (('~', '1', '2', '3', '4', '5', '6', '7', '8', '9', '0', '°', '+'),
         ('', 'A', 'W', 'E', 'R', 'T', 'Y', 'U', 'I', 'O', 'P', '', '£'),
         ('', 'Q', 'S', 'D', 'F', 'G', 'H', 'J', 'K', 'L', 'M', 'Ù', 'μ'),
         ('>', 'W', 'X', 'C', 'V', 'B', 'N', '?', '.', '/', '§'))
    ), 'QWERTZ': (
        (('', '1', '2', '3', '4', '5', '6', '7', '8', '9', '0', 'ß', ''),
         ('', 'q', 'w', 'e', 'r', 't', 'z', 'u', 'i', 'o', 'p', ' ü', '+',
          '\\'),
         ('', 'a', 's', 'd', 'f', 'g', 'h', 'j', 'k', 'l', 'ö', 'ä', '#'),
         ('<', 'y', 'x', 'c', 'v', 'b', 'n', 'm', ',', '.', '-')),
        (('°', '!', '"', '§', '$', '%', '&', '/', '(', ')', '=', '?', ''),
         ('', 'Q', 'W', 'E', 'R', 'T', 'Z', 'U', 'I', 'O', 'P', 'Ü', '*', ''),
         ('', 'A', 'S', 'D', 'F', 'G', 'H', 'J', 'K', 'L', 'Ö', 'Ä', '\''),
         ('>', 'Y', 'X', 'C', 'V', 'B', 'N', 'M', ';', ':', '_'))
    )}
    # fmt: on

    def dist_abs(
        self,
        src,
        tar,
        metric='euclidean',
        cost=(1, 1, 0.5, 0.5),
        layout='QWERTY',
    ):
        """Return the typo distance between two strings.

        Parameters
        ----------
        src : str
            Source string for comparison
        tar : str
            Target string for comparison
        metric : str
            Supported values include: ``euclidean``, ``manhattan``,
            ``log-euclidean``, and ``log-manhattan``
        cost : tuple
            A 4-tuple representing the cost of the four possible edits:
            inserts, deletes, substitutions, and shift, respectively (by
            default: (1, 1, 0.5, 0.5)) The substitution & shift costs should be
            significantly less than the cost of an insertion & deletion unless
            a log metric is used.
        layout : str
            Name of the keyboard layout to use (Currently supported:
            ``QWERTY``, ``Dvorak``, ``AZERTY``, ``QWERTZ``)

        Returns
        -------
        float
            Typo distance

        Raises
        ------
        ValueError
            char not found in any keyboard layouts

        Examples
        --------
        >>> cmp = Typo()
        >>> cmp.dist_abs('cat', 'hat')
        1.5811388
        >>> cmp.dist_abs('Niall', 'Neil')
        2.8251407
        >>> cmp.dist_abs('Colin', 'Cuilen')
        3.4142137
        >>> cmp.dist_abs('ATCG', 'TAGC')
        2.5

        >>> cmp.dist_abs('cat', 'hat', metric='manhattan')
        2.0
        >>> cmp.dist_abs('Niall', 'Neil', metric='manhattan')
        3.0
        >>> cmp.dist_abs('Colin', 'Cuilen', metric='manhattan')
        3.5
        >>> cmp.dist_abs('ATCG', 'TAGC', metric='manhattan')
        2.5

        >>> cmp.dist_abs('cat', 'hat', metric='log-manhattan')
        0.804719
        >>> cmp.dist_abs('Niall', 'Neil', metric='log-manhattan')
        2.2424533
        >>> cmp.dist_abs('Colin', 'Cuilen', metric='log-manhattan')
        2.2424533
        >>> cmp.dist_abs('ATCG', 'TAGC', metric='log-manhattan')
        2.3465736

        """
        ins_cost, del_cost, sub_cost, shift_cost = cost

        if src == tar:
            return 0.0
        if not src:
            return len(tar) * ins_cost
        if not tar:
            return len(src) * del_cost

        keyboard = self._keyboard[layout]
        lowercase = {item for sublist in keyboard[0] for item in sublist}
        uppercase = {item for sublist in keyboard[1] for item in sublist}

        def _kb_array_for_char(char):
            """Return the keyboard layout that contains ch.

            Parameters
            ----------
            char : str
                The character to lookup

            Returns
            -------
            tuple
                A keyboard

            Raises
            ------
            ValueError
                char not found in any keyboard layouts

            """
            if char in lowercase:
                return keyboard[0]
            elif char in uppercase:
                return keyboard[1]
            raise ValueError(char + ' not found in any keyboard layouts')

        def _substitution_cost(char1, char2):
            cost = sub_cost
            cost *= metric_dict[metric](char1, char2) + shift_cost * (
                _kb_array_for_char(char1) != _kb_array_for_char(char2)
            )
            return cost

        def _get_char_coord(char, kb_array):
            """Return the row & column of char in the keyboard.

            Parameters
            ----------
            char : str
                The character to search for
            kb_array : tuple of tuples
                The array of key positions

            Returns
            -------
            tuple
                The row & column of the key

            """
            for row in kb_array:  # pragma: no branch
                if char in row:
                    return kb_array.index(row), row.index(char)

        def _euclidean_keyboard_distance(char1, char2):
            row1, col1 = _get_char_coord(char1, _kb_array_for_char(char1))
            row2, col2 = _get_char_coord(char2, _kb_array_for_char(char2))
            return ((row1 - row2) ** 2 + (col1 - col2) ** 2) ** 0.5

        def _manhattan_keyboard_distance(char1, char2):
            row1, col1 = _get_char_coord(char1, _kb_array_for_char(char1))
            row2, col2 = _get_char_coord(char2, _kb_array_for_char(char2))
            return abs(row1 - row2) + abs(col1 - col2)

        def _log_euclidean_keyboard_distance(char1, char2):
            return log(1 + _euclidean_keyboard_distance(char1, char2))

        def _log_manhattan_keyboard_distance(char1, char2):
            return log(1 + _manhattan_keyboard_distance(char1, char2))

        metric_dict = {
            'euclidean': _euclidean_keyboard_distance,
            'manhattan': _manhattan_keyboard_distance,
            'log-euclidean': _log_euclidean_keyboard_distance,
            'log-manhattan': _log_manhattan_keyboard_distance,
        }

        d_mat = np_zeros((len(src) + 1, len(tar) + 1), dtype=np_float32)
        for i in range(len(src) + 1):
            d_mat[i, 0] = i * del_cost
        for j in range(len(tar) + 1):
            d_mat[0, j] = j * ins_cost

        for i in range(len(src)):
            for j in range(len(tar)):
                d_mat[i + 1, j + 1] = min(
                    d_mat[i + 1, j] + ins_cost,  # ins
                    d_mat[i, j + 1] + del_cost,  # del
                    d_mat[i, j]
                    + (
                        _substitution_cost(src[i], tar[j])
                        if src[i] != tar[j]
                        else 0
                    ),  # sub/==
                )

        return d_mat[len(src), len(tar)]

    def dist(
        self,
        src,
        tar,
        metric='euclidean',
        cost=(1, 1, 0.5, 0.5),
        layout='QWERTY',
    ):
        """Return the normalized typo distance between two strings.

        This is typo distance, normalized to [0, 1].

        Parameters
        ----------
        src : str
            Source string for comparison
        tar : str
            Target string for comparison
        metric : str
            Supported values include: ``euclidean``, ``manhattan``,
            ``log-euclidean``, and ``log-manhattan``
        cost : tuple
            A 4-tuple representing the cost of the four possible edits:
            inserts, deletes, substitutions, and shift, respectively (by
            default: (1, 1, 0.5, 0.5)) The substitution & shift costs should be
            significantly less than the cost of an insertion & deletion unless
            a log metric is used.
        layout : str
            Name of the keyboard layout to use (Currently supported:
            ``QWERTY``, ``Dvorak``, ``AZERTY``, ``QWERTZ``)

        Returns
        -------
        float
            Normalized typo distance

        Examples
        --------
        >>> cmp = Typo()
        >>> round(cmp.dist('cat', 'hat'), 12)
        0.527046283086
        >>> round(cmp.dist('Niall', 'Neil'), 12)
        0.565028142929
        >>> round(cmp.dist('Colin', 'Cuilen'), 12)
        0.569035609563
        >>> cmp.dist('ATCG', 'TAGC')
        0.625

        """
        if src == tar:
            return 0.0
        ins_cost, del_cost = cost[:2]
        return self.dist_abs(src, tar, metric, cost, layout) / (
            max(len(src) * del_cost, len(tar) * ins_cost)
        )


def typo(src, tar, metric='euclidean', cost=(1, 1, 0.5, 0.5), layout='QWERTY'):
    """Return the typo distance between two strings.

    This is a wrapper for :py:meth:`Typo.typo`.

    Parameters
    ----------
    src : str
        Source string for comparison
    tar : str
        Target string for comparison
    metric : str
        Supported values include: ``euclidean``, ``manhattan``,
        ``log-euclidean``, and ``log-manhattan``
    cost : tuple
        A 4-tuple representing the cost of the four possible edits: inserts,
        deletes, substitutions, and shift, respectively (by default:
        (1, 1, 0.5, 0.5)) The substitution & shift costs should be
        significantly less than the cost of an insertion & deletion unless a
        log metric is used.
    layout : str
        Name of the keyboard layout to use (Currently supported:
        ``QWERTY``, ``Dvorak``, ``AZERTY``, ``QWERTZ``)

    Returns
    -------
    float
        Typo distance

    Examples
    --------
    >>> typo('cat', 'hat')
    1.5811388
    >>> typo('Niall', 'Neil')
    2.8251407
    >>> typo('Colin', 'Cuilen')
    3.4142137
    >>> typo('ATCG', 'TAGC')
    2.5

    >>> typo('cat', 'hat', metric='manhattan')
    2.0
    >>> typo('Niall', 'Neil', metric='manhattan')
    3.0
    >>> typo('Colin', 'Cuilen', metric='manhattan')
    3.5
    >>> typo('ATCG', 'TAGC', metric='manhattan')
    2.5

    >>> typo('cat', 'hat', metric='log-manhattan')
    0.804719
    >>> typo('Niall', 'Neil', metric='log-manhattan')
    2.2424533
    >>> typo('Colin', 'Cuilen', metric='log-manhattan')
    2.2424533
    >>> typo('ATCG', 'TAGC', metric='log-manhattan')
    2.3465736

    """
    return Typo().dist_abs(src, tar, metric, cost, layout)


def dist_typo(
    src, tar, metric='euclidean', cost=(1, 1, 0.5, 0.5), layout='QWERTY'
):
    """Return the normalized typo distance between two strings.

    This is a wrapper for :py:meth:`Typo.dist`.

    Parameters
    ----------
    src : str
        Source string for comparison
    tar : str
        Target string for comparison
    metric : str
        Supported values include: ``euclidean``, ``manhattan``,
        ``log-euclidean``, and ``log-manhattan``
    cost : tuple
        A 4-tuple representing the cost of the four possible edits: inserts,
        deletes, substitutions, and shift, respectively (by default:
        (1, 1, 0.5, 0.5)) The substitution & shift costs should be
        significantly less than the cost of an insertion & deletion unless a
        log metric is used.
    layout : str
        Name of the keyboard layout to use (Currently supported:
        ``QWERTY``, ``Dvorak``, ``AZERTY``, ``QWERTZ``)

    Returns
    -------
    float
        Normalized typo distance

    Examples
    --------
    >>> round(dist_typo('cat', 'hat'), 12)
    0.527046283086
    >>> round(dist_typo('Niall', 'Neil'), 12)
    0.565028142929
    >>> round(dist_typo('Colin', 'Cuilen'), 12)
    0.569035609563
    >>> dist_typo('ATCG', 'TAGC')
    0.625

    """
    return Typo().dist(src, tar, metric, cost, layout)


def sim_typo(
    src, tar, metric='euclidean', cost=(1, 1, 0.5, 0.5), layout='QWERTY'
):
    """Return the normalized typo similarity between two strings.

    This is a wrapper for :py:meth:`Typo.sim`.

    Parameters
    ----------
    src : str
        Source string for comparison
    tar : str
        Target string for comparison
    metric : str
        Supported values include: ``euclidean``, ``manhattan``,
        ``log-euclidean``, and ``log-manhattan``
    cost : tuple
        A 4-tuple representing the cost of the four possible edits: inserts,
        deletes, substitutions, and shift, respectively (by default:
        (1, 1, 0.5, 0.5)) The substitution & shift costs should be
        significantly less than the cost of an insertion & deletion unless a
        log metric is used.
    layout : str
        Name of the keyboard layout to use (Currently supported:
        ``QWERTY``, ``Dvorak``, ``AZERTY``, ``QWERTZ``)

    Returns
    -------
    float
        Normalized typo similarity

    Examples
    --------
    >>> round(sim_typo('cat', 'hat'), 12)
    0.472953716914
    >>> round(sim_typo('Niall', 'Neil'), 12)
    0.434971857071
    >>> round(sim_typo('Colin', 'Cuilen'), 12)
    0.430964390437
    >>> sim_typo('ATCG', 'TAGC')
    0.375

    """
    return Typo().sim(src, tar, metric, cost, layout)


if __name__ == '__main__':
    import doctest

    doctest.testmod()


1		# -- coding: utf-8 --
2
3		# Copyright 2018 by Christopher C. Little.
4		# This file is part of Abydos.
5		#
6		# Abydos is free software: you can redistribute it and/or modify
7		# it under the terms of the GNU General Public License as published by
8		# the Free Software Foundation, either version 3 of the License, or
9		# (at your option) any later version.
10		#
11		# Abydos is distributed in the hope that it will be useful,
12		# but WITHOUT ANY WARRANTY; without even the implied warranty of
13		# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14		# GNU General Public License for more details.
15		#
16		# You should have received a copy of the GNU General Public License
17		# along with Abydos. If not, see <http://www.gnu.org/licenses/>.
18
19	1	"""abydos.distance._typo.
20
21		Typo edit distance functions.
22		"""
23
24	1	from __future__ import (
25		absolute_import,
26		division,
27		print_function,
28		unicode_literals,
29		)
30
31	1	from math import log
32
33	1	from numpy import float32 as np_float32
34	1	from numpy import zeros as np_zeros
35
36	1	from six.moves import range
37
38	1	from ._distance import _Distance
39
40	1	__all__ = ['Typo', 'dist_typo', 'sim_typo', 'typo']
41
42
43	1	class Typo(_Distance):
44		"""Typo distance.
45
46		This is inspired by Typo-Distance :cite:`Song:2011`, and a fair bit of
47		this was copied from that module. Compared to the original, this supports
48		different metrics for substitution.
49		"""
50
51		# fmt: off
52	1	_keyboard = {'QWERTY': (
53		(('`', '1', '2', '3', '4', '5', '6', '7', '8', '9', '0', '-', '='),
54		('', 'q', 'w', 'e', 'r', 't', 'y', 'u', 'i', 'o', 'p', '[', ']',
55		'\\'),
56		('', 'a', 's', 'd', 'f', 'g', 'h', 'j', 'k', 'l', ';', '\''),
57		('', 'z', 'x', 'c', 'v', 'b', 'n', 'm', ',', '.', '/')),
58		(('~', '!', '@', '#', '$', '%', '^', '&', '*', '(', ')', '_', '+'),
59		('', 'Q', 'W', 'E', 'R', 'T', 'Y', 'U', 'I', 'O', 'P', '{', '}', '\|'),
60		('', 'A', 'S', 'D', 'F', 'G', 'H', 'J', 'K', 'L', ':', '"'),
61		('', 'Z', 'X', 'C', 'V', 'B', 'N', 'M', '<', '>', '?'))
62		), 'Dvorak': (
63		(('`', '1', '2', '3', '4', '5', '6', '7', '8', '9', '0', '[', ']'),
64		('', '\'', ',', '.', 'p', 'y', 'f', 'g', 'c', 'r', 'l', '/', '=',
65		'\\'),
66		('', 'a', 'o', 'e', 'u', 'i', 'd', 'h', 't', 'n', 's', '-'),
67		('', ';', 'q', 'j', 'k', 'x', 'b', 'm', 'w', 'v', 'z')),
68		(('~', '!', '@', '#', '$', '%', '^', '&', '*', '(', ')', '{', '}'),
69		('', '"', '<', '>', 'P', 'Y', 'F', 'G', 'C', 'R', 'L', '?', '+', '\|'),
70		('', 'A', 'O', 'E', 'U', 'I', 'D', 'H', 'T', 'N', 'S', '_'),
71		('', ':', 'Q', 'J', 'K', 'X', 'B', 'M', 'W', 'V', 'Z'))
72		), 'AZERTY': (
73		(('²', '&', 'é', '"', '\'', '(', '-', 'è', '_', 'ç', 'à', ')', '='),
74		('', 'a', 'z', 'e', 'r', 't', 'y', 'u', 'i', 'o', 'p', '', '$'),
75		('', 'q', 's', 'd', 'f', 'g', 'h', 'j', 'k', 'l', 'm', 'ù', '*'),
76		('<', 'w', 'x', 'c', 'v', 'b', 'n', ',', ';', ':', '!')),
77		(('~', '1', '2', '3', '4', '5', '6', '7', '8', '9', '0', '°', '+'),
78		('', 'A', 'W', 'E', 'R', 'T', 'Y', 'U', 'I', 'O', 'P', '', '£'),
79		('', 'Q', 'S', 'D', 'F', 'G', 'H', 'J', 'K', 'L', 'M', 'Ù', 'μ'),
80		('>', 'W', 'X', 'C', 'V', 'B', 'N', '?', '.', '/', '§'))
81		), 'QWERTZ': (
82		(('', '1', '2', '3', '4', '5', '6', '7', '8', '9', '0', 'ß', ''),
83		('', 'q', 'w', 'e', 'r', 't', 'z', 'u', 'i', 'o', 'p', ' ü', '+',
84		'\\'),
85		('', 'a', 's', 'd', 'f', 'g', 'h', 'j', 'k', 'l', 'ö', 'ä', '#'),
86		('<', 'y', 'x', 'c', 'v', 'b', 'n', 'm', ',', '.', '-')),
87		(('°', '!', '"', '§', '$', '%', '&', '/', '(', ')', '=', '?', ''),
88		('', 'Q', 'W', 'E', 'R', 'T', 'Z', 'U', 'I', 'O', 'P', 'Ü', '*', ''),
89		('', 'A', 'S', 'D', 'F', 'G', 'H', 'J', 'K', 'L', 'Ö', 'Ä', '\''),
90		('>', 'Y', 'X', 'C', 'V', 'B', 'N', 'M', ';', ':', '_'))
91		)}
92		# fmt: on
93
94	1	def dist_abs(
95		self,
96		src,
97		tar,
98		metric='euclidean',
99		cost=(1, 1, 0.5, 0.5),
100		layout='QWERTY',
101		):
102		"""Return the typo distance between two strings.
103
104		Parameters
105		----------
106		src : str
107		Source string for comparison
108		tar : str
109		Target string for comparison
110		metric : str
111		Supported values include: ``euclidean``, ``manhattan``,
112		``log-euclidean``, and ``log-manhattan``
113		cost : tuple
114		A 4-tuple representing the cost of the four possible edits:
115		inserts, deletes, substitutions, and shift, respectively (by
116		default: (1, 1, 0.5, 0.5)) The substitution & shift costs should be
117		significantly less than the cost of an insertion & deletion unless
118		a log metric is used.
119		layout : str
120		Name of the keyboard layout to use (Currently supported:
121		``QWERTY``, ``Dvorak``, ``AZERTY``, ``QWERTZ``)
122
123		Returns
124		-------
125		float
126		Typo distance
127
128		Raises
129		------
130		ValueError
131		char not found in any keyboard layouts
132
133		Examples
134		--------
135		>>> cmp = Typo()
136		>>> cmp.dist_abs('cat', 'hat')
137		1.5811388
138		>>> cmp.dist_abs('Niall', 'Neil')
139		2.8251407
140		>>> cmp.dist_abs('Colin', 'Cuilen')
141		3.4142137
142		>>> cmp.dist_abs('ATCG', 'TAGC')
143		2.5
144
145		>>> cmp.dist_abs('cat', 'hat', metric='manhattan')
146		2.0
147		>>> cmp.dist_abs('Niall', 'Neil', metric='manhattan')
148		3.0
149		>>> cmp.dist_abs('Colin', 'Cuilen', metric='manhattan')
150		3.5
151		>>> cmp.dist_abs('ATCG', 'TAGC', metric='manhattan')
152		2.5
153
154		>>> cmp.dist_abs('cat', 'hat', metric='log-manhattan')
155		0.804719
156		>>> cmp.dist_abs('Niall', 'Neil', metric='log-manhattan')
157		2.2424533
158		>>> cmp.dist_abs('Colin', 'Cuilen', metric='log-manhattan')
159		2.2424533
160		>>> cmp.dist_abs('ATCG', 'TAGC', metric='log-manhattan')
161		2.3465736
162
163		"""
164	1	ins_cost, del_cost, sub_cost, shift_cost = cost
165
166	1	if src == tar:
167	1	return 0.0
168	1	if not src:
169	1	return len(tar) * ins_cost
170	1	if not tar:
171	1	return len(src) * del_cost
172
173	1	keyboard = self._keyboard[layout]
174	1	lowercase = {item for sublist in keyboard[0] for item in sublist}
175	1	uppercase = {item for sublist in keyboard[1] for item in sublist}
176
177	1	def _kb_array_for_char(char):
178		"""Return the keyboard layout that contains ch.
179
180		Parameters
181		----------
182		char : str
183		The character to lookup
184
185		Returns
186		-------
187		tuple
188		A keyboard
189
190		Raises
191		------
192		ValueError
193		char not found in any keyboard layouts
194
195		"""
196	1	if char in lowercase:
197	1	return keyboard[0]
198	1	elif char in uppercase:
199	1	return keyboard[1]
200	1	raise ValueError(char + ' not found in any keyboard layouts')
201
202	1	def _substitution_cost(char1, char2):
203	1	cost = sub_cost
204	1	cost = metric_dict[metric](char1, char2) + shift_cost (
205		_kb_array_for_char(char1) != _kb_array_for_char(char2)
206		)
207	1	return cost
208
209	1	def _get_char_coord(char, kb_array):
210		"""Return the row & column of char in the keyboard.
211
212		Parameters
213		----------
214		char : str
215		The character to search for
216		kb_array : tuple of tuples
217		The array of key positions
218
219		Returns
220		-------
221		tuple
222		The row & column of the key
223
224		"""
225	1	for row in kb_array: # pragma: no branch
226	1	if char in row:
227	1	return kb_array.index(row), row.index(char)
228
229	1	def _euclidean_keyboard_distance(char1, char2):
230	1	row1, col1 = _get_char_coord(char1, _kb_array_for_char(char1))
231	1	row2, col2 = _get_char_coord(char2, _kb_array_for_char(char2))
232	1	return ((row1 - row2) 2 + (col1 - col2) 2) ** 0.5
233
234	1	def _manhattan_keyboard_distance(char1, char2):
235	1	row1, col1 = _get_char_coord(char1, _kb_array_for_char(char1))
236	1	row2, col2 = _get_char_coord(char2, _kb_array_for_char(char2))
237	1	return abs(row1 - row2) + abs(col1 - col2)
238
239	1	def _log_euclidean_keyboard_distance(char1, char2):
240	1	return log(1 + _euclidean_keyboard_distance(char1, char2))
241
242	1	def _log_manhattan_keyboard_distance(char1, char2):
243	1	return log(1 + _manhattan_keyboard_distance(char1, char2))
244
245	1	metric_dict = {
246		'euclidean': _euclidean_keyboard_distance,
247		'manhattan': _manhattan_keyboard_distance,
248		'log-euclidean': _log_euclidean_keyboard_distance,
249		'log-manhattan': _log_manhattan_keyboard_distance,
250		}
251
252	1	d_mat = np_zeros((len(src) + 1, len(tar) + 1), dtype=np_float32)
253	1	for i in range(len(src) + 1):
254	1	d_mat[i, 0] = i * del_cost
255	1	for j in range(len(tar) + 1):
256	1	d_mat[0, j] = j * ins_cost
257
258	1	for i in range(len(src)):
259	1	for j in range(len(tar)):
260	1	d_mat[i + 1, j + 1] = min(
261		d_mat[i + 1, j] + ins_cost, # ins
262		d_mat[i, j + 1] + del_cost, # del
263		d_mat[i, j]
264		+ (
265		_substitution_cost(src[i], tar[j])
266		if src[i] != tar[j]
267		else 0
268		), # sub/==
269		)
270
271	1	return d_mat[len(src), len(tar)]
272
273	1	def dist(
274		self,
275		src,
276		tar,
277		metric='euclidean',
278		cost=(1, 1, 0.5, 0.5),
279		layout='QWERTY',
280		):
281		"""Return the normalized typo distance between two strings.
282
283		This is typo distance, normalized to [0, 1].
284
285		Parameters
286		----------
287		src : str
288		Source string for comparison
289		tar : str
290		Target string for comparison
291		metric : str
292		Supported values include: ``euclidean``, ``manhattan``,
293		``log-euclidean``, and ``log-manhattan``
294		cost : tuple
295		A 4-tuple representing the cost of the four possible edits:
296		inserts, deletes, substitutions, and shift, respectively (by
297		default: (1, 1, 0.5, 0.5)) The substitution & shift costs should be
298		significantly less than the cost of an insertion & deletion unless
299		a log metric is used.
300		layout : str
301		Name of the keyboard layout to use (Currently supported:
302		``QWERTY``, ``Dvorak``, ``AZERTY``, ``QWERTZ``)
303
304		Returns
305		-------
306		float
307		Normalized typo distance
308
309		Examples
310		--------
311		>>> cmp = Typo()
312		>>> round(cmp.dist('cat', 'hat'), 12)
313		0.527046283086
314		>>> round(cmp.dist('Niall', 'Neil'), 12)
315		0.565028142929
316		>>> round(cmp.dist('Colin', 'Cuilen'), 12)
317		0.569035609563
318		>>> cmp.dist('ATCG', 'TAGC')
319		0.625
320
321		"""
322	1	if src == tar:
323	1	return 0.0
324	1	ins_cost, del_cost = cost[:2]
325	1	return self.dist_abs(src, tar, metric, cost, layout) / (
326		max(len(src) * del_cost, len(tar) * ins_cost)
327		)
328
329
330	1	def typo(src, tar, metric='euclidean', cost=(1, 1, 0.5, 0.5), layout='QWERTY'):
331		"""Return the typo distance between two strings.
332
333		This is a wrapper for :py:meth:`Typo.typo`.
334
335		Parameters
336		----------
337		src : str
338		Source string for comparison
339		tar : str
340		Target string for comparison
341		metric : str
342		Supported values include: ``euclidean``, ``manhattan``,
343		``log-euclidean``, and ``log-manhattan``
344		cost : tuple
345		A 4-tuple representing the cost of the four possible edits: inserts,
346		deletes, substitutions, and shift, respectively (by default:
347		(1, 1, 0.5, 0.5)) The substitution & shift costs should be
348		significantly less than the cost of an insertion & deletion unless a
349		log metric is used.
350		layout : str
351		Name of the keyboard layout to use (Currently supported:
352		``QWERTY``, ``Dvorak``, ``AZERTY``, ``QWERTZ``)
353
354		Returns
355		-------
356		float
357		Typo distance
358
359		Examples
360		--------
361		>>> typo('cat', 'hat')
362		1.5811388
363		>>> typo('Niall', 'Neil')
364		2.8251407
365		>>> typo('Colin', 'Cuilen')
366		3.4142137
367		>>> typo('ATCG', 'TAGC')
368		2.5
369
370		>>> typo('cat', 'hat', metric='manhattan')
371		2.0
372		>>> typo('Niall', 'Neil', metric='manhattan')
373		3.0
374		>>> typo('Colin', 'Cuilen', metric='manhattan')
375		3.5
376		>>> typo('ATCG', 'TAGC', metric='manhattan')
377		2.5
378
379		>>> typo('cat', 'hat', metric='log-manhattan')
380		0.804719
381		>>> typo('Niall', 'Neil', metric='log-manhattan')
382		2.2424533
383		>>> typo('Colin', 'Cuilen', metric='log-manhattan')
384		2.2424533
385		>>> typo('ATCG', 'TAGC', metric='log-manhattan')
386		2.3465736
387
388		"""
389	1	return Typo().dist_abs(src, tar, metric, cost, layout)
390
391
392	1	def dist_typo(
393		src, tar, metric='euclidean', cost=(1, 1, 0.5, 0.5), layout='QWERTY'
394		):
395		"""Return the normalized typo distance between two strings.
396
397		This is a wrapper for :py:meth:`Typo.dist`.
398
399		Parameters
400		----------
401		src : str
402		Source string for comparison
403		tar : str
404		Target string for comparison
405		metric : str
406		Supported values include: ``euclidean``, ``manhattan``,
407		``log-euclidean``, and ``log-manhattan``
408		cost : tuple
409		A 4-tuple representing the cost of the four possible edits: inserts,
410		deletes, substitutions, and shift, respectively (by default:
411		(1, 1, 0.5, 0.5)) The substitution & shift costs should be
412		significantly less than the cost of an insertion & deletion unless a
413		log metric is used.
414		layout : str
415		Name of the keyboard layout to use (Currently supported:
416		``QWERTY``, ``Dvorak``, ``AZERTY``, ``QWERTZ``)
417
418		Returns
419		-------
420		float
421		Normalized typo distance
422
423		Examples
424		--------
425		>>> round(dist_typo('cat', 'hat'), 12)
426		0.527046283086
427		>>> round(dist_typo('Niall', 'Neil'), 12)
428		0.565028142929
429		>>> round(dist_typo('Colin', 'Cuilen'), 12)
430		0.569035609563
431		>>> dist_typo('ATCG', 'TAGC')
432		0.625
433
434		"""
435	1	return Typo().dist(src, tar, metric, cost, layout)
436
437
438	1	def sim_typo(
439		src, tar, metric='euclidean', cost=(1, 1, 0.5, 0.5), layout='QWERTY'
440		):
441		"""Return the normalized typo similarity between two strings.
442
443		This is a wrapper for :py:meth:`Typo.sim`.
444
445		Parameters
446		----------
447		src : str
448		Source string for comparison
449		tar : str
450		Target string for comparison
451		metric : str
452		Supported values include: ``euclidean``, ``manhattan``,
453		``log-euclidean``, and ``log-manhattan``
454		cost : tuple
455		A 4-tuple representing the cost of the four possible edits: inserts,
456		deletes, substitutions, and shift, respectively (by default:
457		(1, 1, 0.5, 0.5)) The substitution & shift costs should be
458		significantly less than the cost of an insertion & deletion unless a
459		log metric is used.
460		layout : str
461		Name of the keyboard layout to use (Currently supported:
462		``QWERTY``, ``Dvorak``, ``AZERTY``, ``QWERTZ``)
463
464		Returns
465		-------
466		float
467		Normalized typo similarity
468
469		Examples
470		--------
471		>>> round(sim_typo('cat', 'hat'), 12)
472		0.472953716914
473		>>> round(sim_typo('Niall', 'Neil'), 12)
474		0.434971857071
475		>>> round(sim_typo('Colin', 'Cuilen'), 12)
476		0.430964390437
477		>>> sim_typo('ATCG', 'TAGC')
478		0.375
479
480		"""
481	1	return Typo().sim(src, tar, metric, cost, layout)
482
483
484		if __name__ == '__main__':
485		import doctest
486
487		doctest.testmod()
488

chrislit / abydos

Push — master ( 3ac297...afe14d )

abydos.distance._typo.Typo.dist_abs() D

Complexity

Size

Duplication

Code Coverage

Importance

How to fix Long Method Complexity

Long Method

Complexity

Duplication Side-by-Side

Filter issues like