abydos.distance._phonetic_edit_distance.PhoneticEditDistance.dist() - Code Metrics - chrislit/abydos - Measure and Improve Code Quality continuously with Scrutinizer

PhoneticEditDistance.dist() A
last analyzed 2020-12-31 20:10 UTC

↳ Parent: abydos.distance._phonetic_edit_distance

Complexity

Conditions

Size

Total Lines	50
Code Lines	9

Duplication

Lines	0
Ratio	0 %

Code Coverage

Tests	7
CRAP Score	2

Importance

Changes

Metric	Value
eloc	9
dl	0
loc	50
ccs	7
cts	7
cp	1
rs	9.95
c	0
b	0
f	0
cc	2
nop	3
crap	2

# Copyright 2019-2020 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.

"""abydos.distance._phonetic_edit_distance.

Phonetic edit distance
"""

from typing import (
    Any,
    Callable,
    Dict,
    Iterable,
    List,
    Optional,
    Sequence,
    Tuple,
    Union,
    cast,
)

import numpy as np

from ._levenshtein import Levenshtein
from ..phones._phones import _FEATURE_MASK, cmp_features, ipa_to_features

__all__ = ['PhoneticEditDistance']


class PhoneticEditDistance(Levenshtein):
    """Phonetic edit distance.

    This is a variation on Levenshtein edit distance, intended for strings in
    IPA, that compares individual phones based on their featural similarity.

    .. versionadded:: 0.4.1
    """

    def __init__(
        self,
        mode: str = 'lev',
        cost: Tuple[float, float, float, float] = (1, 1, 1, 0.33333),
        normalizer: Callable[[List[float]], float] = max,
        weights: Optional[Union[Iterable[float], Dict[str, float]]] = None,
        **kwargs: Any
    ):
        """Initialize PhoneticEditDistance instance.

        Parameters
        ----------
        mode : str
            Specifies a mode for computing the edit distance:

                - ``lev`` (default) computes the ordinary Levenshtein distance,
                  in which edits may include inserts, deletes, and
                  substitutions
                - ``osa`` computes the Optimal String Alignment distance, in
                  which edits may include inserts, deletes, substitutions, and
                  transpositions but substrings may only be edited once

        cost : tuple
            A 4-tuple representing the cost of the four possible edits:
            inserts, deletes, substitutions, and transpositions, respectively
            (by default: (1, 1, 1, 0.33333)). Note that transpositions cost a
            relatively low 0.33333. If this were 1.0, no phones would ever be
            transposed under the normal weighting, since even quite dissimilar
            phones such as [a] and [p] still agree in nearly 63% of their
            features.
        normalizer : function
            A function that takes an list and computes a normalization term
            by which the edit distance is divided (max by default). Another
            good option is the sum function.
        weights : None or list or tuple or dict
            If None, all features are of equal significance and a simple
            normalized hamming distance of the features is calculated. If a
            list or tuple of numeric values is supplied, the values are
            inferred as the weights for each feature, in order of the features
            listed in abydos.phones._phones._FEATURE_MASK. If a dict is
            supplied, its key values should match keys in
            abydos.phones._phones._FEATURE_MASK to which each weight (value)
            should be assigned. Missing values in all cases are assigned a
            weight of 0 and will be omitted from the comparison.
        **kwargs
            Arbitrary keyword arguments


        .. versionadded:: 0.4.1

        """
        super(PhoneticEditDistance, self).__init__(**kwargs)
        self._mode = mode
        self._cost = cost
        self._normalizer = normalizer

        if isinstance(weights, dict):
            weights = [
                weights[feature] if feature in weights else 0
                for feature in sorted(
                    _FEATURE_MASK, key=_FEATURE_MASK.get, reverse=True
                )
            ]
        elif isinstance(weights, (list, tuple)):
            weights = list(weights) + [0] * (len(_FEATURE_MASK) - len(weights))
        self._weights = weights

    def _alignment_matrix(
        self, src: str, tar: str, backtrace: bool = True
    ) -> Union[np.ndarray, Tuple[np.ndarray, np.ndarray]]:
        """Return the phonetic edit distance alignment matrix.

        Parameters
        ----------
        src : str
            Source string for comparison
        tar : str
            Target string for comparison
        backtrace : bool
            Return the backtrace matrix as well

        Returns
        -------
        numpy.ndarray or tuple(numpy.ndarray, numpy.ndarray)
            The alignment matrix and (optionally) the backtrace matrix


        .. versionadded:: 0.4.1

        """
        ins_cost, del_cost, sub_cost, trans_cost = self._cost

        src_len = len(src)
        tar_len = len(tar)

        src_list = ipa_to_features(src)
        tar_list = ipa_to_features(tar)

        d_mat = np.zeros((src_len + 1, tar_len + 1), dtype=np.float_)
        if backtrace:
            trace_mat = np.zeros((src_len + 1, tar_len + 1), dtype=np.int8)
        for i in range(1, src_len + 1):
            d_mat[i, 0] = i * del_cost
            if backtrace:
                trace_mat[i, 0] = 0
        for j in range(1, tar_len + 1):
            d_mat[0, j] = j * ins_cost
            if backtrace:
                trace_mat[0, j] = 1

        for i in range(src_len):
            for j in range(tar_len):
                traces = ((i + 1, j), (i, j + 1), (i, j))
                opts = (
                    d_mat[traces[0]] + ins_cost,  # ins
                    d_mat[traces[1]] + del_cost,  # del
                    d_mat[traces[2]]
                    + (
                        sub_cost
                        * (
                            1.0
                            - cmp_features(
                                src_list[i],
                                tar_list[j],
                                cast(Sequence[float], self._weights),
                            )
                        )
                        if src_list[i] != tar_list[j]
                        else 0
                    ),  # sub/==
                )
                d_mat[i + 1, j + 1] = min(opts)
                if backtrace:
                    trace_mat[i + 1, j + 1] = int(np.argmin(opts))

                if self._mode == 'osa':
                    if (
                        i + 1 > 1
                        and j + 1 > 1
                        and src_list[i] == tar_list[j - 1]
                        and src_list[i - 1] == tar_list[j]
                    ):
                        # transposition
                        d_mat[i + 1, j + 1] = min(
                            d_mat[i + 1, j + 1],
                            d_mat[i - 1, j - 1] + trans_cost,
                        )
                        if backtrace:
                            trace_mat[i + 1, j + 1] = 2
        if backtrace:
            return d_mat, trace_mat

        return d_mat

    def dist_abs(self, src: str, tar: str) -> float:
        """Return the phonetic edit distance between two strings.

        Parameters
        ----------
        src : str
            Source string for comparison
        tar : str
            Target string for comparison

        Returns
        -------
        int (may return a float if cost has float values)
            The phonetic edit distance between src & tar

        Examples
        --------
        >>> cmp = PhoneticEditDistance()
        >>> cmp.dist_abs('cat', 'hat')
        0.17741935483870974
        >>> cmp.dist_abs('Niall', 'Neil')
        1.161290322580645
        >>> cmp.dist_abs('aluminum', 'Catalan')
        2.467741935483871
        >>> cmp.dist_abs('ATCG', 'TAGC')
        1.193548387096774

        >>> cmp = PhoneticEditDistance(mode='osa')
        >>> cmp.dist_abs('ATCG', 'TAGC')
        0.46236225806451603
        >>> cmp.dist_abs('ACTG', 'TAGC')
        1.2580645161290323


        .. versionadded:: 0.4.1

        """
        ins_cost, del_cost, sub_cost, trans_cost = self._cost

        src_len = len(src)
        tar_len = len(tar)

        if src == tar:
            return 0
        if not src:
            return ins_cost * tar_len
        if not tar:
            return del_cost * src_len

        d_mat = cast(
            np.ndarray, self._alignment_matrix(src, tar, backtrace=False)
        )

        if int(d_mat[src_len, tar_len]) == d_mat[src_len, tar_len]:
            return int(d_mat[src_len, tar_len])
        else:
            return cast(float, d_mat[src_len, tar_len])

    def dist(self, src: str, tar: str) -> float:
        """Return the normalized phonetic edit distance between two strings.

        The edit distance is normalized by dividing the edit distance
        (calculated by either of the two supported methods) by the
        greater of the number of characters in src times the cost of a delete
        and the number of characters in tar times the cost of an insert.
        For the case in which all operations have :math:`cost = 1`, this is
        equivalent to the greater of the length of the two strings src & tar.

        Parameters
        ----------
        src : str
            Source string for comparison
        tar : str
            Target string for comparison

        Returns
        -------
        float
            The normalized Levenshtein distance between src & tar

        Examples
        --------
        >>> cmp = PhoneticEditDistance()
        >>> round(cmp.dist('cat', 'hat'), 12)
        0.059139784946
        >>> round(cmp.dist('Niall', 'Neil'), 12)
        0.232258064516
        >>> cmp.dist('aluminum', 'Catalan')
        0.3084677419354839
        >>> cmp.dist('ATCG', 'TAGC')
        0.2983870967741935


        .. versionadded:: 0.4.1

        """
        if src == tar:
            return 0.0
        ins_cost, del_cost = self._cost[:2]

        src_len = len(src)
        tar_len = len(tar)

        normalize_term = self._normalizer(
            [src_len * del_cost, tar_len * ins_cost]
        )

        return self.dist_abs(src, tar) / normalize_term


if __name__ == '__main__':
    import doctest

    doctest.testmod()


1		# Copyright 2019-2020 by Christopher C. Little.
2		# This file is part of Abydos.
3		#
4		# Abydos is free software: you can redistribute it and/or modify
5		# it under the terms of the GNU General Public License as published by
6		# the Free Software Foundation, either version 3 of the License, or
7		# (at your option) any later version.
8		#
9		# Abydos is distributed in the hope that it will be useful,
10		# but WITHOUT ANY WARRANTY; without even the implied warranty of
11		# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12		# GNU General Public License for more details.
13		#
14		# You should have received a copy of the GNU General Public License
15		# along with Abydos. If not, see <http://www.gnu.org/licenses/>.
16
17		"""abydos.distance._phonetic_edit_distance.
18
19	1	Phonetic edit distance
20		"""
21
22		from typing import (
23		Any,
24	1	Callable,
25		Dict,
26		Iterable,
27		List,
28		Optional,
29		Sequence,
30		Tuple,
31	1	Union,
32		cast,
33	1	)
34
35	1	import numpy as np
36	1
37		from ._levenshtein import Levenshtein
38	1	from ..phones._phones import _FEATURE_MASK, cmp_features, ipa_to_features
39
40		__all__ = ['PhoneticEditDistance']
41	1
42
43		class PhoneticEditDistance(Levenshtein):
44		"""Phonetic edit distance.
45
46		This is a variation on Levenshtein edit distance, intended for strings in
47		IPA, that compares individual phones based on their featural similarity.
48
49		.. versionadded:: 0.4.1
50	1	"""
51
52		def __init__(
53		self,
54		mode: str = 'lev',
55		cost: Tuple[float, float, float, float] = (1, 1, 1, 0.33333),
56		normalizer: Callable[[List[float]], float] = max,
57		weights: Optional[Union[Iterable[float], Dict[str, float]]] = None,
58		**kwargs: Any
59		):
60		"""Initialize PhoneticEditDistance instance.
61
62		Parameters
63		----------
64		mode : str
65		Specifies a mode for computing the edit distance:
66
67		- ``lev`` (default) computes the ordinary Levenshtein distance,
68		in which edits may include inserts, deletes, and
69		substitutions
70		- ``osa`` computes the Optimal String Alignment distance, in
71		which edits may include inserts, deletes, substitutions, and
72		transpositions but substrings may only be edited once
73
74		cost : tuple
75		A 4-tuple representing the cost of the four possible edits:
76		inserts, deletes, substitutions, and transpositions, respectively
77		(by default: (1, 1, 1, 0.33333)). Note that transpositions cost a
78		relatively low 0.33333. If this were 1.0, no phones would ever be
79		transposed under the normal weighting, since even quite dissimilar
80		phones such as [a] and [p] still agree in nearly 63% of their
81		features.
82		normalizer : function
83		A function that takes an list and computes a normalization term
84		by which the edit distance is divided (max by default). Another
85		good option is the sum function.
86		weights : None or list or tuple or dict
87		If None, all features are of equal significance and a simple
88		normalized hamming distance of the features is calculated. If a
89		list or tuple of numeric values is supplied, the values are
90		inferred as the weights for each feature, in order of the features
91		listed in abydos.phones._phones._FEATURE_MASK. If a dict is
92		supplied, its key values should match keys in
93		abydos.phones._phones._FEATURE_MASK to which each weight (value)
94		should be assigned. Missing values in all cases are assigned a
95		weight of 0 and will be omitted from the comparison.
96		**kwargs
97		Arbitrary keyword arguments
98
99
100		.. versionadded:: 0.4.1
101	1
102	1	"""
103	1	super(PhoneticEditDistance, self).__init__(**kwargs)
104	1	self._mode = mode
105		self._cost = cost
106	1	self._normalizer = normalizer
107	1
108		if isinstance(weights, dict):
109		weights = [
110		weights[feature] if feature in weights else 0
111		for feature in sorted(
112		_FEATURE_MASK, key=_FEATURE_MASK.get, reverse=True
113	1	)
114	1	]
115	1	elif isinstance(weights, (list, tuple)):
116		weights = list(weights) + [0] * (len(_FEATURE_MASK) - len(weights))
117	1	self._weights = weights
118
119		def _alignment_matrix(
120		self, src: str, tar: str, backtrace: bool = True
121		) -> Union[np.ndarray, Tuple[np.ndarray, np.ndarray]]:
122		"""Return the phonetic edit distance alignment matrix.
123
124		Parameters
125		----------
126		src : str
127		Source string for comparison
128		tar : str
129		Target string for comparison
130		backtrace : bool
131		Return the backtrace matrix as well
132
133		Returns
134		-------
135		numpy.ndarray or tuple(numpy.ndarray, numpy.ndarray)
136		The alignment matrix and (optionally) the backtrace matrix
137
138	1
139		.. versionadded:: 0.4.1
140	1
141	1	"""
142		ins_cost, del_cost, sub_cost, trans_cost = self._cost
143	1
144	1	src_len = len(src)
145		tar_len = len(tar)
146	1
147	1	src_list = ipa_to_features(src)
148	1	tar_list = ipa_to_features(tar)
149	1
150	1	d_mat = np.zeros((src_len + 1, tar_len + 1), dtype=np.float_)
151	1	if backtrace:
152	1	trace_mat = np.zeros((src_len + 1, tar_len + 1), dtype=np.int8)
153	1	for i in range(1, src_len + 1):
154	1	d_mat[i, 0] = i * del_cost
155	1	if backtrace:
156	1	trace_mat[i, 0] = 0
157		for j in range(1, tar_len + 1):
158	1	d_mat[0, j] = j * ins_cost
159	1	if backtrace:
160	1	trace_mat[0, j] = 1
161	1
162		for i in range(src_len):
163		for j in range(tar_len):
164		traces = ((i + 1, j), (i, j + 1), (i, j))
165		opts = (
166		d_mat[traces[0]] + ins_cost, # ins
167		d_mat[traces[1]] + del_cost, # del
168		d_mat[traces[2]]
169		+ (
170		sub_cost
171		* (
172	1	1.0
173	1	- cmp_features(
174	1	src_list[i],
175		tar_list[j],
176	1	cast(Sequence[float], self._weights),
177	1	)
178		)
179		if src_list[i] != tar_list[j]
180		else 0
181		), # sub/==
182		)
183		d_mat[i + 1, j + 1] = min(opts)
184	1	if backtrace:
185		trace_mat[i + 1, j + 1] = int(np.argmin(opts))
186
187		if self._mode == 'osa':
188	1	if (
189	1	i + 1 > 1
190	1	and j + 1 > 1
191	1	and src_list[i] == tar_list[j - 1]
192	1	and src_list[i - 1] == tar_list[j]
193		):
194	1	# transposition
195		d_mat[i + 1, j + 1] = min(
196		d_mat[i + 1, j + 1],
197		d_mat[i - 1, j - 1] + trans_cost,
198		)
199		if backtrace:
200		trace_mat[i + 1, j + 1] = 2
201		if backtrace:
202		return d_mat, trace_mat
		0 ignored issues – show introduced 2020-01-06 02:51 UTC by Report Bug Copy Issue Report The variable `trace_mat` does not seem to be defined in case `backtrace` on line `151` is `False`. Are you sure this can never be the case? Loading history...
203		return d_mat
204
205		def dist_abs(self, src: str, tar: str) -> float:
206		"""Return the phonetic edit distance between two strings.
207
208		Parameters
209		----------
210		src : str
211		Source string for comparison
212		tar : str
213		Target string for comparison
214
215		Returns
216		-------
217		int (may return a float if cost has float values)
218		The phonetic edit distance between src & tar
219
220		Examples
221		--------
222		>>> cmp = PhoneticEditDistance()
223		>>> cmp.dist_abs('cat', 'hat')
224		0.17741935483870974
225		>>> cmp.dist_abs('Niall', 'Neil')
226		1.161290322580645
227		>>> cmp.dist_abs('aluminum', 'Catalan')
228		2.467741935483871
229		>>> cmp.dist_abs('ATCG', 'TAGC')
230		1.193548387096774
231	1
232		>>> cmp = PhoneticEditDistance(mode='osa')
233	1	>>> cmp.dist_abs('ATCG', 'TAGC')
234	1	0.46236225806451603
235		>>> cmp.dist_abs('ACTG', 'TAGC')
236	1	1.2580645161290323
237	1
238	1
239	1	.. versionadded:: 0.4.1
240	1
241	1	"""
242		ins_cost, del_cost, sub_cost, trans_cost = self._cost
243	1
244		src_len = len(src)
245	1	tar_len = len(tar)
246	1
247		if src == tar:
248	1	return 0
249		if not src:
250	1	return ins_cost * tar_len
251		if not tar:
252		return del_cost * src_len
253
254		d_mat = cast(
255		np.ndarray, self._alignment_matrix(src, tar, backtrace=False)
256		)
257
258		if int(d_mat[src_len, tar_len]) == d_mat[src_len, tar_len]:
259		return int(d_mat[src_len, tar_len])
260		else:
261		return cast(float, d_mat[src_len, tar_len])
262
263		def dist(self, src: str, tar: str) -> float:
264		"""Return the normalized phonetic edit distance between two strings.
265
266		The edit distance is normalized by dividing the edit distance
267		(calculated by either of the two supported methods) by the
268		greater of the number of characters in src times the cost of a delete
269		and the number of characters in tar times the cost of an insert.
270		For the case in which all operations have :math:`cost = 1`, this is
271		equivalent to the greater of the length of the two strings src & tar.
272
273		Parameters
274		----------
275		src : str
276		Source string for comparison
277		tar : str
278		Target string for comparison
279
280		Returns
281		-------
282		float
283		The normalized Levenshtein distance between src & tar
284
285		Examples
286		--------
287		>>> cmp = PhoneticEditDistance()
288	1	>>> round(cmp.dist('cat', 'hat'), 12)
289	1	0.059139784946
290	1	>>> round(cmp.dist('Niall', 'Neil'), 12)
291		0.232258064516
292	1	>>> cmp.dist('aluminum', 'Catalan')
293	1	0.3084677419354839
294		>>> cmp.dist('ATCG', 'TAGC')
295	1	0.2983870967741935
296
297
298		.. versionadded:: 0.4.1
299	1
300		"""
301		if src == tar:
302		return 0.0
303		ins_cost, del_cost = self._cost[:2]
304
305		src_len = len(src)
306		tar_len = len(tar)
307
308		normalize_term = self._normalizer(
309		[src_len * del_cost, tar_len * ins_cost]
310		)
311
312		return self.dist_abs(src, tar) / normalize_term
313
314
315		if __name__ == '__main__':
316		import doctest
317
318		doctest.testmod()
319

chrislit / abydos

PhoneticEditDistance.dist() A last analyzed 2020-12-31 20:10 UTC

Complexity

Size

Duplication

Code Coverage

Importance

Duplication Side-by-Side

Filter issues like

PhoneticEditDistance.dist() A
last analyzed 2020-12-31 20:10 UTC