Completed
Push — master ( 643512...2b6b3e )
by Chris
20:40 queued 10:36
created

abydos.distance._phonetic_edit_distance   A

Complexity

Total Complexity 28

Size/Duplication

Total Lines 306
Duplicated Lines 4.58 %

Test Coverage

Coverage 100%

Importance

Changes 0
Metric Value
wmc 28
eloc 103
dl 14
loc 306
ccs 72
cts 72
cp 1
rs 10
c 0
b 0
f 0

4 Methods

Rating   Name   Duplication   Size   Complexity  
F PhoneticEditDistance._alignment_matrix() 14 76 17
A PhoneticEditDistance.dist_abs() 0 55 5
A PhoneticEditDistance.__init__() 0 66 4
A PhoneticEditDistance.dist() 0 50 2

How to fix   Duplicated Code   

Duplicated Code

Duplicate code is one of the most pungent code smells. A rule that is often used is to re-structure code once it is duplicated in three or more places.

Common duplication problems, and corresponding solutions are:

1
# -*- coding: utf-8 -*-
2
3
# Copyright 2019 by Christopher C. Little.
4
# This file is part of Abydos.
5
#
6
# Abydos is free software: you can redistribute it and/or modify
7
# it under the terms of the GNU General Public License as published by
8
# the Free Software Foundation, either version 3 of the License, or
9
# (at your option) any later version.
10
#
11
# Abydos is distributed in the hope that it will be useful,
12
# but WITHOUT ANY WARRANTY; without even the implied warranty of
13
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14
# GNU General Public License for more details.
15
#
16
# You should have received a copy of the GNU General Public License
17
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.
18
19 1
"""abydos.distance._phonetic_edit_distance.
20
21
Phonetic edit distance
22
"""
23
24 1
from __future__ import (
25
    absolute_import,
26
    division,
27
    print_function,
28
    unicode_literals,
29
)
30
31 1
import numpy as np
32
33 1
from six.moves import range
34
35 1
from ._levenshtein import Levenshtein
36 1
from ..phones._phones import _FEATURE_MASK, cmp_features, ipa_to_features
37
38 1
__all__ = ['PhoneticEditDistance']
39
40
41 1
class PhoneticEditDistance(Levenshtein):
42
    """Phonetic edit distance.
43
44
    This is a variation on Levenshtein edit distance, intended for strings in
45
    IPA, that compares individual phones based on their featural similarity.
46
47
    .. versionadded:: 0.4.1
48
    """
49
50 1
    def __init__(
51
        self,
52
        mode='lev',
53
        cost=(1, 1, 1, 0.33333),
54
        normalizer=max,
55
        weights=None,
56
        **kwargs
57
    ):
58
        """Initialize PhoneticEditDistance instance.
59
60
        Parameters
61
        ----------
62
        mode : str
63
            Specifies a mode for computing the edit distance:
64
65
                - ``lev`` (default) computes the ordinary Levenshtein distance,
66
                  in which edits may include inserts, deletes, and
67
                  substitutions
68
                - ``osa`` computes the Optimal String Alignment distance, in
69
                  which edits may include inserts, deletes, substitutions, and
70
                  transpositions but substrings may only be edited once
71
72
        cost : tuple
73
            A 4-tuple representing the cost of the four possible edits:
74
            inserts, deletes, substitutions, and transpositions, respectively
75
            (by default: (1, 1, 1, 0.33333)). Note that transpositions cost a
76
            relatively low 0.33333. If this were 1.0, no phones would ever be
77
            transposed under the normal weighting, since even quite dissimilar
78
            phones such as [a] and [p] still agree in nearly 63% of their
79
            features.
80
        normalizer : function
81
            A function that takes an list and computes a normalization term
82
            by which the edit distance is divided (max by default). Another
83
            good option is the sum function.
84
        weights : None or list or tuple or dict
85
            If None, all features are of equal significance and a simple
86
            normalized hamming distance of the features is calculated. If a
87
            list or tuple of numeric values is supplied, the values are
88
            inferred as the weights for each feature, in order of the features
89
            listed in abydos.phones._phones._FEATURE_MASK. If a dict is
90
            supplied, its key values should match keys in
91
            abydos.phones._phones._FEATURE_MASK to which each weight (value)
92
            should be assigned. Missing values in all cases are assigned a
93
            weight of 0 and will be omitted from the comparison.
94
        **kwargs
95
            Arbitrary keyword arguments
96
97
98
        .. versionadded:: 0.4.1
99
100
        """
101 1
        super(PhoneticEditDistance, self).__init__(**kwargs)
102 1
        self._mode = mode
103 1
        self._cost = cost
104 1
        self._normalizer = normalizer
105
106 1
        if isinstance(weights, dict):
107 1
            weights = [
108
                weights[feature] if feature in weights else 0
109
                for feature in sorted(
110
                    _FEATURE_MASK, key=_FEATURE_MASK.get, reverse=True
111
                )
112
            ]
113 1
        elif isinstance(weights, (list, tuple)):
114 1
            weights = list(weights) + [0] * (len(_FEATURE_MASK) - len(weights))
115 1
        self._weights = weights
116
117 1
    def _alignment_matrix(self, src, tar, backtrace=True):
118
        """Return the phonetic edit distance alignment matrix.
119
120
        Parameters
121
        ----------
122
        src : str
123
            Source string for comparison
124
        tar : str
125
            Target string for comparison
126
        backtrace : bool
127
            Return the backtrace matrix as well
128
129
        Returns
130
        -------
131
        numpy.ndarray or tuple(numpy.ndarray, numpy.ndarray)
132
            The alignment matrix and (optionally) the backtrace matrix
133
134
135
        .. versionadded:: 0.4.1
136
137
        """
138 1
        ins_cost, del_cost, sub_cost, trans_cost = self._cost
139
140 1
        src_len = len(src)
141 1
        tar_len = len(tar)
142
143 1
        src = ipa_to_features(src)
144 1
        tar = ipa_to_features(tar)
145
146 1
        d_mat = np.zeros((src_len + 1, tar_len + 1), dtype=np.float)
147 1
        if backtrace:
148 1
            trace_mat = np.zeros((src_len + 1, tar_len + 1), dtype=np.int8)
149 1
        for i in range(1, src_len + 1):
150 1
            d_mat[i, 0] = i * del_cost
151 1
            if backtrace:
152 1
                trace_mat[i, 0] = 0
153 1
        for j in range(1, tar_len + 1):
154 1
            d_mat[0, j] = j * ins_cost
155 1
            if backtrace:
156 1
                trace_mat[0, j] = 1
157
158 1
        for i in range(src_len):
159 1
            for j in range(tar_len):
160 1
                traces = ((i + 1, j), (i, j + 1), (i, j))
161 1
                opts = (
162
                    d_mat[traces[0]] + ins_cost,  # ins
163
                    d_mat[traces[1]] + del_cost,  # del
164
                    d_mat[traces[2]]
165
                    + (
166
                        sub_cost
167
                        * (1.0 - cmp_features(src[i], tar[j], self._weights))
168
                        if src[i] != tar[j]
169
                        else 0
170
                    ),  # sub/==
171
                )
172 1
                d_mat[i + 1, j + 1] = min(opts)
173 1
                if backtrace:
174 1
                    trace_mat[i + 1, j + 1] = int(np.argmin(opts))
175
176 1 View Code Duplication
                if self._mode == 'osa':
0 ignored issues
show
Duplication introduced by
This code seems to be duplicated in your project.
Loading history...
177 1
                    if (
178
                        i + 1 > 1
179
                        and j + 1 > 1
180
                        and src[i] == tar[j - 1]
181
                        and src[i - 1] == tar[j]
182
                    ):
183
                        # transposition
184 1
                        d_mat[i + 1, j + 1] = min(
185
                            d_mat[i + 1, j + 1],
186
                            d_mat[i - 1, j - 1] + trans_cost,
187
                        )
188 1
                        if backtrace:
189 1
                            trace_mat[i + 1, j + 1] = 2
190 1
        if backtrace:
191 1
            return d_mat, trace_mat
0 ignored issues
show
introduced by
The variable trace_mat does not seem to be defined in case backtrace on line 147 is False. Are you sure this can never be the case?
Loading history...
192 1
        return d_mat
193
194 1
    def dist_abs(self, src, tar):
195
        """Return the phonetic edit distance between two strings.
196
197
        Parameters
198
        ----------
199
        src : str
200
            Source string for comparison
201
        tar : str
202
            Target string for comparison
203
204
        Returns
205
        -------
206
        int (may return a float if cost has float values)
207
            The phonetic edit distance between src & tar
208
209
        Examples
210
        --------
211
        >>> cmp = PhoneticEditDistance()
212
        >>> cmp.dist_abs('cat', 'hat')
213
        0.17741935483870974
214
        >>> cmp.dist_abs('Niall', 'Neil')
215
        1.161290322580645
216
        >>> cmp.dist_abs('aluminum', 'Catalan')
217
        2.467741935483871
218
        >>> cmp.dist_abs('ATCG', 'TAGC')
219
        1.193548387096774
220
221
        >>> cmp = PhoneticEditDistance(mode='osa')
222
        >>> cmp.dist_abs('ATCG', 'TAGC')
223
        0.46236225806451603
224
        >>> cmp.dist_abs('ACTG', 'TAGC')
225
        1.2580645161290323
226
227
228
        .. versionadded:: 0.4.1
229
230
        """
231 1
        ins_cost, del_cost, sub_cost, trans_cost = self._cost
232
233 1
        src_len = len(src)
234 1
        tar_len = len(tar)
235
236 1
        if src == tar:
237 1
            return 0
238 1
        if not src:
239 1
            return ins_cost * tar_len
240 1
        if not tar:
241 1
            return del_cost * src_len
242
243 1
        d_mat = self._alignment_matrix(src, tar, backtrace=False)
244
245 1
        if int(d_mat[src_len, tar_len]) == d_mat[src_len, tar_len]:
246 1
            return int(d_mat[src_len, tar_len])
247
        else:
248 1
            return d_mat[src_len, tar_len]
249
250 1
    def dist(self, src, tar):
251
        """Return the normalized phonetic edit distance between two strings.
252
253
        The edit distance is normalized by dividing the edit distance
254
        (calculated by either of the two supported methods) by the
255
        greater of the number of characters in src times the cost of a delete
256
        and the number of characters in tar times the cost of an insert.
257
        For the case in which all operations have :math:`cost = 1`, this is
258
        equivalent to the greater of the length of the two strings src & tar.
259
260
        Parameters
261
        ----------
262
        src : str
263
            Source string for comparison
264
        tar : str
265
            Target string for comparison
266
267
        Returns
268
        -------
269
        float
270
            The normalized Levenshtein distance between src & tar
271
272
        Examples
273
        --------
274
        >>> cmp = PhoneticEditDistance()
275
        >>> round(cmp.dist('cat', 'hat'), 12)
276
        0.059139784946
277
        >>> round(cmp.dist('Niall', 'Neil'), 12)
278
        0.232258064516
279
        >>> cmp.dist('aluminum', 'Catalan')
280
        0.3084677419354839
281
        >>> cmp.dist('ATCG', 'TAGC')
282
        0.2983870967741935
283
284
285
        .. versionadded:: 0.4.1
286
287
        """
288 1
        if src == tar:
289 1
            return 0.0
290 1
        ins_cost, del_cost = self._cost[:2]
291
292 1
        src_len = len(src)
293 1
        tar_len = len(tar)
294
295 1
        normalize_term = self._normalizer(
296
            [src_len * del_cost, tar_len * ins_cost]
297
        )
298
299 1
        return self.dist_abs(src, tar) / normalize_term
300
301
302
if __name__ == '__main__':
303
    import doctest
304
305
    doctest.testmod()
306