abydos.distance._phonetic_edit_distance - Code Metrics - Inspection of "Merge pull request #240 from chrislit/0.4.1" - chrislit/abydos - Measure and Improve Code Quality continuously with Scrutinizer

Completed

Push — master ( 643512...2b6b3e )

by Chris

created 2020-01-06 02:41 UTC

abydos.distance._phonetic_edit_distance A

↳ Parent: Project

Complexity

Total Complexity

Size/Duplication

Total Lines	306
Duplicated Lines	4.58 %

Test Coverage

Coverage

100%

Importance

Changes

Metric	Value
wmc	28
eloc	103
dl	14
loc	306
ccs	72
cts	72
cp	1
rs	10
c	0
b	0
f	0

4 Methods

Rating	Name	Duplication	Size	Complexity
F	PhoneticEditDistance._alignment_matrix()	14	76	17
A	PhoneticEditDistance.dist_abs()	0	55	5
A	PhoneticEditDistance.__init__()	0	66	4
A	PhoneticEditDistance.dist()	0	50	2

How to fix Duplicated Code

# -*- coding: utf-8 -*-

# Copyright 2019 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.

"""abydos.distance._phonetic_edit_distance.

Phonetic edit distance
"""

from __future__ import (
    absolute_import,
    division,
    print_function,
    unicode_literals,
)

import numpy as np

from six.moves import range

from ._levenshtein import Levenshtein
from ..phones._phones import _FEATURE_MASK, cmp_features, ipa_to_features

__all__ = ['PhoneticEditDistance']


class PhoneticEditDistance(Levenshtein):
    """Phonetic edit distance.

    This is a variation on Levenshtein edit distance, intended for strings in
    IPA, that compares individual phones based on their featural similarity.

    .. versionadded:: 0.4.1
    """

    def __init__(
        self,
        mode='lev',
        cost=(1, 1, 1, 0.33333),
        normalizer=max,
        weights=None,
        **kwargs
    ):
        """Initialize PhoneticEditDistance instance.

        Parameters
        ----------
        mode : str
            Specifies a mode for computing the edit distance:

                - ``lev`` (default) computes the ordinary Levenshtein distance,
                  in which edits may include inserts, deletes, and
                  substitutions
                - ``osa`` computes the Optimal String Alignment distance, in
                  which edits may include inserts, deletes, substitutions, and
                  transpositions but substrings may only be edited once

        cost : tuple
            A 4-tuple representing the cost of the four possible edits:
            inserts, deletes, substitutions, and transpositions, respectively
            (by default: (1, 1, 1, 0.33333)). Note that transpositions cost a
            relatively low 0.33333. If this were 1.0, no phones would ever be
            transposed under the normal weighting, since even quite dissimilar
            phones such as [a] and [p] still agree in nearly 63% of their
            features.
        normalizer : function
            A function that takes an list and computes a normalization term
            by which the edit distance is divided (max by default). Another
            good option is the sum function.
        weights : None or list or tuple or dict
            If None, all features are of equal significance and a simple
            normalized hamming distance of the features is calculated. If a
            list or tuple of numeric values is supplied, the values are
            inferred as the weights for each feature, in order of the features
            listed in abydos.phones._phones._FEATURE_MASK. If a dict is
            supplied, its key values should match keys in
            abydos.phones._phones._FEATURE_MASK to which each weight (value)
            should be assigned. Missing values in all cases are assigned a
            weight of 0 and will be omitted from the comparison.
        **kwargs
            Arbitrary keyword arguments


        .. versionadded:: 0.4.1

        """
        super(PhoneticEditDistance, self).__init__(**kwargs)
        self._mode = mode
        self._cost = cost
        self._normalizer = normalizer

        if isinstance(weights, dict):
            weights = [
                weights[feature] if feature in weights else 0
                for feature in sorted(
                    _FEATURE_MASK, key=_FEATURE_MASK.get, reverse=True
                )
            ]
        elif isinstance(weights, (list, tuple)):
            weights = list(weights) + [0] * (len(_FEATURE_MASK) - len(weights))
        self._weights = weights

    def _alignment_matrix(self, src, tar, backtrace=True):
        """Return the phonetic edit distance alignment matrix.

        Parameters
        ----------
        src : str
            Source string for comparison
        tar : str
            Target string for comparison
        backtrace : bool
            Return the backtrace matrix as well

        Returns
        -------
        numpy.ndarray or tuple(numpy.ndarray, numpy.ndarray)
            The alignment matrix and (optionally) the backtrace matrix


        .. versionadded:: 0.4.1

        """
        ins_cost, del_cost, sub_cost, trans_cost = self._cost

        src_len = len(src)
        tar_len = len(tar)

        src = ipa_to_features(src)
        tar = ipa_to_features(tar)

        d_mat = np.zeros((src_len + 1, tar_len + 1), dtype=np.float)
        if backtrace:
            trace_mat = np.zeros((src_len + 1, tar_len + 1), dtype=np.int8)
        for i in range(1, src_len + 1):
            d_mat[i, 0] = i * del_cost
            if backtrace:
                trace_mat[i, 0] = 0
        for j in range(1, tar_len + 1):
            d_mat[0, j] = j * ins_cost
            if backtrace:
                trace_mat[0, j] = 1

        for i in range(src_len):
            for j in range(tar_len):
                traces = ((i + 1, j), (i, j + 1), (i, j))
                opts = (
                    d_mat[traces[0]] + ins_cost,  # ins
                    d_mat[traces[1]] + del_cost,  # del
                    d_mat[traces[2]]
                    + (
                        sub_cost
                        * (1.0 - cmp_features(src[i], tar[j], self._weights))
                        if src[i] != tar[j]
                        else 0
                    ),  # sub/==
                )
                d_mat[i + 1, j + 1] = min(opts)
                if backtrace:
                    trace_mat[i + 1, j + 1] = int(np.argmin(opts))

                if self._mode == 'osa':

                    if (
                        i + 1 > 1
                        and j + 1 > 1
                        and src[i] == tar[j - 1]
                        and src[i - 1] == tar[j]
                    ):
                        # transposition
                        d_mat[i + 1, j + 1] = min(
                            d_mat[i + 1, j + 1],
                            d_mat[i - 1, j - 1] + trans_cost,
                        )
                        if backtrace:
                            trace_mat[i + 1, j + 1] = 2
        if backtrace:
            return d_mat, trace_mat

        return d_mat

    def dist_abs(self, src, tar):
        """Return the phonetic edit distance between two strings.

        Parameters
        ----------
        src : str
            Source string for comparison
        tar : str
            Target string for comparison

        Returns
        -------
        int (may return a float if cost has float values)
            The phonetic edit distance between src & tar

        Examples
        --------
        >>> cmp = PhoneticEditDistance()
        >>> cmp.dist_abs('cat', 'hat')
        0.17741935483870974
        >>> cmp.dist_abs('Niall', 'Neil')
        1.161290322580645
        >>> cmp.dist_abs('aluminum', 'Catalan')
        2.467741935483871
        >>> cmp.dist_abs('ATCG', 'TAGC')
        1.193548387096774

        >>> cmp = PhoneticEditDistance(mode='osa')
        >>> cmp.dist_abs('ATCG', 'TAGC')
        0.46236225806451603
        >>> cmp.dist_abs('ACTG', 'TAGC')
        1.2580645161290323


        .. versionadded:: 0.4.1

        """
        ins_cost, del_cost, sub_cost, trans_cost = self._cost

        src_len = len(src)
        tar_len = len(tar)

        if src == tar:
            return 0
        if not src:
            return ins_cost * tar_len
        if not tar:
            return del_cost * src_len

        d_mat = self._alignment_matrix(src, tar, backtrace=False)

        if int(d_mat[src_len, tar_len]) == d_mat[src_len, tar_len]:
            return int(d_mat[src_len, tar_len])
        else:
            return d_mat[src_len, tar_len]

    def dist(self, src, tar):
        """Return the normalized phonetic edit distance between two strings.

        The edit distance is normalized by dividing the edit distance
        (calculated by either of the two supported methods) by the
        greater of the number of characters in src times the cost of a delete
        and the number of characters in tar times the cost of an insert.
        For the case in which all operations have :math:`cost = 1`, this is
        equivalent to the greater of the length of the two strings src & tar.

        Parameters
        ----------
        src : str
            Source string for comparison
        tar : str
            Target string for comparison

        Returns
        -------
        float
            The normalized Levenshtein distance between src & tar

        Examples
        --------
        >>> cmp = PhoneticEditDistance()
        >>> round(cmp.dist('cat', 'hat'), 12)
        0.059139784946
        >>> round(cmp.dist('Niall', 'Neil'), 12)
        0.232258064516
        >>> cmp.dist('aluminum', 'Catalan')
        0.3084677419354839
        >>> cmp.dist('ATCG', 'TAGC')
        0.2983870967741935


        .. versionadded:: 0.4.1

        """
        if src == tar:
            return 0.0
        ins_cost, del_cost = self._cost[:2]

        src_len = len(src)
        tar_len = len(tar)

        normalize_term = self._normalizer(
            [src_len * del_cost, tar_len * ins_cost]
        )

        return self.dist_abs(src, tar) / normalize_term


if __name__ == '__main__':
    import doctest

    doctest.testmod()


1			# -- coding: utf-8 --
2
3			# Copyright 2019 by Christopher C. Little.
4			# This file is part of Abydos.
5			#
6			# Abydos is free software: you can redistribute it and/or modify
7			# it under the terms of the GNU General Public License as published by
8			# the Free Software Foundation, either version 3 of the License, or
9			# (at your option) any later version.
10			#
11			# Abydos is distributed in the hope that it will be useful,
12			# but WITHOUT ANY WARRANTY; without even the implied warranty of
13			# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14			# GNU General Public License for more details.
15			#
16			# You should have received a copy of the GNU General Public License
17			# along with Abydos. If not, see <http://www.gnu.org/licenses/>.
18
19	1		"""abydos.distance._phonetic_edit_distance.
20
21			Phonetic edit distance
22			"""
23
24	1		from __future__ import (
25			absolute_import,
26			division,
27			print_function,
28			unicode_literals,
29			)
30
31	1		import numpy as np
32
33	1		from six.moves import range
34
35	1		from ._levenshtein import Levenshtein
36	1		from ..phones._phones import _FEATURE_MASK, cmp_features, ipa_to_features
37
38	1		__all__ = ['PhoneticEditDistance']
39
40
41	1		class PhoneticEditDistance(Levenshtein):
42			"""Phonetic edit distance.
43
44			This is a variation on Levenshtein edit distance, intended for strings in
45			IPA, that compares individual phones based on their featural similarity.
46
47			.. versionadded:: 0.4.1
48			"""
49
50	1		def __init__(
51			self,
52			mode='lev',
53			cost=(1, 1, 1, 0.33333),
54			normalizer=max,
55			weights=None,
56			**kwargs
57			):
58			"""Initialize PhoneticEditDistance instance.
59
60			Parameters
61			----------
62			mode : str
63			Specifies a mode for computing the edit distance:
64
65			- ``lev`` (default) computes the ordinary Levenshtein distance,
66			in which edits may include inserts, deletes, and
67			substitutions
68			- ``osa`` computes the Optimal String Alignment distance, in
69			which edits may include inserts, deletes, substitutions, and
70			transpositions but substrings may only be edited once
71
72			cost : tuple
73			A 4-tuple representing the cost of the four possible edits:
74			inserts, deletes, substitutions, and transpositions, respectively
75			(by default: (1, 1, 1, 0.33333)). Note that transpositions cost a
76			relatively low 0.33333. If this were 1.0, no phones would ever be
77			transposed under the normal weighting, since even quite dissimilar
78			phones such as [a] and [p] still agree in nearly 63% of their
79			features.
80			normalizer : function
81			A function that takes an list and computes a normalization term
82			by which the edit distance is divided (max by default). Another
83			good option is the sum function.
84			weights : None or list or tuple or dict
85			If None, all features are of equal significance and a simple
86			normalized hamming distance of the features is calculated. If a
87			list or tuple of numeric values is supplied, the values are
88			inferred as the weights for each feature, in order of the features
89			listed in abydos.phones._phones._FEATURE_MASK. If a dict is
90			supplied, its key values should match keys in
91			abydos.phones._phones._FEATURE_MASK to which each weight (value)
92			should be assigned. Missing values in all cases are assigned a
93			weight of 0 and will be omitted from the comparison.
94			**kwargs
95			Arbitrary keyword arguments
96
97
98			.. versionadded:: 0.4.1
99
100			"""
101	1		super(PhoneticEditDistance, self).__init__(**kwargs)
102	1		self._mode = mode
103	1		self._cost = cost
104	1		self._normalizer = normalizer
105
106	1		if isinstance(weights, dict):
107	1		weights = [
108			weights[feature] if feature in weights else 0
109			for feature in sorted(
110			_FEATURE_MASK, key=_FEATURE_MASK.get, reverse=True
111			)
112			]
113	1		elif isinstance(weights, (list, tuple)):
114	1		weights = list(weights) + [0] * (len(_FEATURE_MASK) - len(weights))
115	1		self._weights = weights
116
117	1		def _alignment_matrix(self, src, tar, backtrace=True):
118			"""Return the phonetic edit distance alignment matrix.
119
120			Parameters
121			----------
122			src : str
123			Source string for comparison
124			tar : str
125			Target string for comparison
126			backtrace : bool
127			Return the backtrace matrix as well
128
129			Returns
130			-------
131			numpy.ndarray or tuple(numpy.ndarray, numpy.ndarray)
132			The alignment matrix and (optionally) the backtrace matrix
133
134
135			.. versionadded:: 0.4.1
136
137			"""
138	1		ins_cost, del_cost, sub_cost, trans_cost = self._cost
139
140	1		src_len = len(src)
141	1		tar_len = len(tar)
142
143	1		src = ipa_to_features(src)
144	1		tar = ipa_to_features(tar)
145
146	1		d_mat = np.zeros((src_len + 1, tar_len + 1), dtype=np.float)
147	1		if backtrace:
148	1		trace_mat = np.zeros((src_len + 1, tar_len + 1), dtype=np.int8)
149	1		for i in range(1, src_len + 1):
150	1		d_mat[i, 0] = i * del_cost
151	1		if backtrace:
152	1		trace_mat[i, 0] = 0
153	1		for j in range(1, tar_len + 1):
154	1		d_mat[0, j] = j * ins_cost
155	1		if backtrace:
156	1		trace_mat[0, j] = 1
157
158	1		for i in range(src_len):
159	1		for j in range(tar_len):
160	1		traces = ((i + 1, j), (i, j + 1), (i, j))
161	1		opts = (
162			d_mat[traces[0]] + ins_cost, # ins
163			d_mat[traces[1]] + del_cost, # del
164			d_mat[traces[2]]
165			+ (
166			sub_cost
167			* (1.0 - cmp_features(src[i], tar[j], self._weights))
168			if src[i] != tar[j]
169			else 0
170			), # sub/==
171			)
172	1		d_mat[i + 1, j + 1] = min(opts)
173	1		if backtrace:
174	1		trace_mat[i + 1, j + 1] = int(np.argmin(opts))
175
176	1	View Code Duplication	if self._mode == 'osa':
			0 ignored issues – show Duplication introduced 2020-01-06 02:51 UTC by Report Bug Copy Issue Report This code seems to be duplicated in your project. Loading history...
177	1		if (
178			i + 1 > 1
179			and j + 1 > 1
180			and src[i] == tar[j - 1]
181			and src[i - 1] == tar[j]
182			):
183			# transposition
184	1		d_mat[i + 1, j + 1] = min(
185			d_mat[i + 1, j + 1],
186			d_mat[i - 1, j - 1] + trans_cost,
187			)
188	1		if backtrace:
189	1		trace_mat[i + 1, j + 1] = 2
190	1		if backtrace:
191	1		return d_mat, trace_mat
			0 ignored issues – show introduced 2020-01-06 02:51 UTC by Report Bug Copy Issue Report The variable `trace_mat` does not seem to be defined in case `backtrace` on line `147` is `False`. Are you sure this can never be the case? Loading history...
192	1		return d_mat
193
194	1		def dist_abs(self, src, tar):
195			"""Return the phonetic edit distance between two strings.
196
197			Parameters
198			----------
199			src : str
200			Source string for comparison
201			tar : str
202			Target string for comparison
203
204			Returns
205			-------
206			int (may return a float if cost has float values)
207			The phonetic edit distance between src & tar
208
209			Examples
210			--------
211			>>> cmp = PhoneticEditDistance()
212			>>> cmp.dist_abs('cat', 'hat')
213			0.17741935483870974
214			>>> cmp.dist_abs('Niall', 'Neil')
215			1.161290322580645
216			>>> cmp.dist_abs('aluminum', 'Catalan')
217			2.467741935483871
218			>>> cmp.dist_abs('ATCG', 'TAGC')
219			1.193548387096774
220
221			>>> cmp = PhoneticEditDistance(mode='osa')
222			>>> cmp.dist_abs('ATCG', 'TAGC')
223			0.46236225806451603
224			>>> cmp.dist_abs('ACTG', 'TAGC')
225			1.2580645161290323
226
227
228			.. versionadded:: 0.4.1
229
230			"""
231	1		ins_cost, del_cost, sub_cost, trans_cost = self._cost
232
233	1		src_len = len(src)
234	1		tar_len = len(tar)
235
236	1		if src == tar:
237	1		return 0
238	1		if not src:
239	1		return ins_cost * tar_len
240	1		if not tar:
241	1		return del_cost * src_len
242
243	1		d_mat = self._alignment_matrix(src, tar, backtrace=False)
244
245	1		if int(d_mat[src_len, tar_len]) == d_mat[src_len, tar_len]:
246	1		return int(d_mat[src_len, tar_len])
247			else:
248	1		return d_mat[src_len, tar_len]
249
250	1		def dist(self, src, tar):
251			"""Return the normalized phonetic edit distance between two strings.
252
253			The edit distance is normalized by dividing the edit distance
254			(calculated by either of the two supported methods) by the
255			greater of the number of characters in src times the cost of a delete
256			and the number of characters in tar times the cost of an insert.
257			For the case in which all operations have :math:`cost = 1`, this is
258			equivalent to the greater of the length of the two strings src & tar.
259
260			Parameters
261			----------
262			src : str
263			Source string for comparison
264			tar : str
265			Target string for comparison
266
267			Returns
268			-------
269			float
270			The normalized Levenshtein distance between src & tar
271
272			Examples
273			--------
274			>>> cmp = PhoneticEditDistance()
275			>>> round(cmp.dist('cat', 'hat'), 12)
276			0.059139784946
277			>>> round(cmp.dist('Niall', 'Neil'), 12)
278			0.232258064516
279			>>> cmp.dist('aluminum', 'Catalan')
280			0.3084677419354839
281			>>> cmp.dist('ATCG', 'TAGC')
282			0.2983870967741935
283
284
285			.. versionadded:: 0.4.1
286
287			"""
288	1		if src == tar:
289	1		return 0.0
290	1		ins_cost, del_cost = self._cost[:2]
291
292	1		src_len = len(src)
293	1		tar_len = len(tar)
294
295	1		normalize_term = self._normalizer(
296			[src_len * del_cost, tar_len * ins_cost]
297			)
298
299	1		return self.dist_abs(src, tar) / normalize_term
300
301
302			if __name__ == '__main__':
303			import doctest
304
305			doctest.testmod()
306

chrislit / abydos

Push — master ( 643512...2b6b3e )

abydos.distance._phonetic_edit_distance A

Complexity

Size/Duplication

Test Coverage

Importance

4 Methods

How to fix Duplicated Code

Duplicated Code

Duplication Side-by-Side

Filter issues like