PhoneticEditDistance.dist()   A
last analyzed

Complexity

Conditions 2

Size

Total Lines 50
Code Lines 9

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 7
CRAP Score 2

Importance

Changes 0
Metric Value
eloc 9
dl 0
loc 50
ccs 7
cts 7
cp 1
rs 9.95
c 0
b 0
f 0
cc 2
nop 3
crap 2
1
# Copyright 2019-2020 by Christopher C. Little.
2
# This file is part of Abydos.
3
#
4
# Abydos is free software: you can redistribute it and/or modify
5
# it under the terms of the GNU General Public License as published by
6
# the Free Software Foundation, either version 3 of the License, or
7
# (at your option) any later version.
8
#
9
# Abydos is distributed in the hope that it will be useful,
10
# but WITHOUT ANY WARRANTY; without even the implied warranty of
11
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12
# GNU General Public License for more details.
13
#
14
# You should have received a copy of the GNU General Public License
15
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.
16
17
"""abydos.distance._phonetic_edit_distance.
18
19 1
Phonetic edit distance
20
"""
21
22
from typing import (
23
    Any,
24 1
    Callable,
25
    Dict,
26
    Iterable,
27
    List,
28
    Optional,
29
    Sequence,
30
    Tuple,
31 1
    Union,
32
    cast,
33 1
)
34
35 1
import numpy as np
36 1
37
from ._levenshtein import Levenshtein
38 1
from ..phones._phones import _FEATURE_MASK, cmp_features, ipa_to_features
39
40
__all__ = ['PhoneticEditDistance']
41 1
42
43
class PhoneticEditDistance(Levenshtein):
44
    """Phonetic edit distance.
45
46
    This is a variation on Levenshtein edit distance, intended for strings in
47
    IPA, that compares individual phones based on their featural similarity.
48
49
    .. versionadded:: 0.4.1
50 1
    """
51
52
    def __init__(
53
        self,
54
        mode: str = 'lev',
55
        cost: Tuple[float, float, float, float] = (1, 1, 1, 0.33333),
56
        normalizer: Callable[[List[float]], float] = max,
57
        weights: Optional[Union[Iterable[float], Dict[str, float]]] = None,
58
        **kwargs: Any
59
    ):
60
        """Initialize PhoneticEditDistance instance.
61
62
        Parameters
63
        ----------
64
        mode : str
65
            Specifies a mode for computing the edit distance:
66
67
                - ``lev`` (default) computes the ordinary Levenshtein distance,
68
                  in which edits may include inserts, deletes, and
69
                  substitutions
70
                - ``osa`` computes the Optimal String Alignment distance, in
71
                  which edits may include inserts, deletes, substitutions, and
72
                  transpositions but substrings may only be edited once
73
74
        cost : tuple
75
            A 4-tuple representing the cost of the four possible edits:
76
            inserts, deletes, substitutions, and transpositions, respectively
77
            (by default: (1, 1, 1, 0.33333)). Note that transpositions cost a
78
            relatively low 0.33333. If this were 1.0, no phones would ever be
79
            transposed under the normal weighting, since even quite dissimilar
80
            phones such as [a] and [p] still agree in nearly 63% of their
81
            features.
82
        normalizer : function
83
            A function that takes an list and computes a normalization term
84
            by which the edit distance is divided (max by default). Another
85
            good option is the sum function.
86
        weights : None or list or tuple or dict
87
            If None, all features are of equal significance and a simple
88
            normalized hamming distance of the features is calculated. If a
89
            list or tuple of numeric values is supplied, the values are
90
            inferred as the weights for each feature, in order of the features
91
            listed in abydos.phones._phones._FEATURE_MASK. If a dict is
92
            supplied, its key values should match keys in
93
            abydos.phones._phones._FEATURE_MASK to which each weight (value)
94
            should be assigned. Missing values in all cases are assigned a
95
            weight of 0 and will be omitted from the comparison.
96
        **kwargs
97
            Arbitrary keyword arguments
98
99
100
        .. versionadded:: 0.4.1
101 1
102 1
        """
103 1
        super(PhoneticEditDistance, self).__init__(**kwargs)
104 1
        self._mode = mode
105
        self._cost = cost
106 1
        self._normalizer = normalizer
107 1
108
        if isinstance(weights, dict):
109
            weights = [
110
                weights[feature] if feature in weights else 0
111
                for feature in sorted(
112
                    _FEATURE_MASK, key=_FEATURE_MASK.get, reverse=True
113 1
                )
114 1
            ]
115 1
        elif isinstance(weights, (list, tuple)):
116
            weights = list(weights) + [0] * (len(_FEATURE_MASK) - len(weights))
117 1
        self._weights = weights
118
119
    def _alignment_matrix(
120
        self, src: str, tar: str, backtrace: bool = True
121
    ) -> Union[np.ndarray, Tuple[np.ndarray, np.ndarray]]:
122
        """Return the phonetic edit distance alignment matrix.
123
124
        Parameters
125
        ----------
126
        src : str
127
            Source string for comparison
128
        tar : str
129
            Target string for comparison
130
        backtrace : bool
131
            Return the backtrace matrix as well
132
133
        Returns
134
        -------
135
        numpy.ndarray or tuple(numpy.ndarray, numpy.ndarray)
136
            The alignment matrix and (optionally) the backtrace matrix
137
138 1
139
        .. versionadded:: 0.4.1
140 1
141 1
        """
142
        ins_cost, del_cost, sub_cost, trans_cost = self._cost
143 1
144 1
        src_len = len(src)
145
        tar_len = len(tar)
146 1
147 1
        src_list = ipa_to_features(src)
148 1
        tar_list = ipa_to_features(tar)
149 1
150 1
        d_mat = np.zeros((src_len + 1, tar_len + 1), dtype=np.float_)
151 1
        if backtrace:
152 1
            trace_mat = np.zeros((src_len + 1, tar_len + 1), dtype=np.int8)
153 1
        for i in range(1, src_len + 1):
154 1
            d_mat[i, 0] = i * del_cost
155 1
            if backtrace:
156 1
                trace_mat[i, 0] = 0
157
        for j in range(1, tar_len + 1):
158 1
            d_mat[0, j] = j * ins_cost
159 1
            if backtrace:
160 1
                trace_mat[0, j] = 1
161 1
162
        for i in range(src_len):
163
            for j in range(tar_len):
164
                traces = ((i + 1, j), (i, j + 1), (i, j))
165
                opts = (
166
                    d_mat[traces[0]] + ins_cost,  # ins
167
                    d_mat[traces[1]] + del_cost,  # del
168
                    d_mat[traces[2]]
169
                    + (
170
                        sub_cost
171
                        * (
172 1
                            1.0
173 1
                            - cmp_features(
174 1
                                src_list[i],
175
                                tar_list[j],
176 1
                                cast(Sequence[float], self._weights),
177 1
                            )
178
                        )
179
                        if src_list[i] != tar_list[j]
180
                        else 0
181
                    ),  # sub/==
182
                )
183
                d_mat[i + 1, j + 1] = min(opts)
184 1
                if backtrace:
185
                    trace_mat[i + 1, j + 1] = int(np.argmin(opts))
186
187
                if self._mode == 'osa':
188 1
                    if (
189 1
                        i + 1 > 1
190 1
                        and j + 1 > 1
191 1
                        and src_list[i] == tar_list[j - 1]
192 1
                        and src_list[i - 1] == tar_list[j]
193
                    ):
194 1
                        # transposition
195
                        d_mat[i + 1, j + 1] = min(
196
                            d_mat[i + 1, j + 1],
197
                            d_mat[i - 1, j - 1] + trans_cost,
198
                        )
199
                        if backtrace:
200
                            trace_mat[i + 1, j + 1] = 2
201
        if backtrace:
202
            return d_mat, trace_mat
0 ignored issues
show
introduced by
The variable trace_mat does not seem to be defined in case backtrace on line 151 is False. Are you sure this can never be the case?
Loading history...
203
        return d_mat
204
205
    def dist_abs(self, src: str, tar: str) -> float:
206
        """Return the phonetic edit distance between two strings.
207
208
        Parameters
209
        ----------
210
        src : str
211
            Source string for comparison
212
        tar : str
213
            Target string for comparison
214
215
        Returns
216
        -------
217
        int (may return a float if cost has float values)
218
            The phonetic edit distance between src & tar
219
220
        Examples
221
        --------
222
        >>> cmp = PhoneticEditDistance()
223
        >>> cmp.dist_abs('cat', 'hat')
224
        0.17741935483870974
225
        >>> cmp.dist_abs('Niall', 'Neil')
226
        1.161290322580645
227
        >>> cmp.dist_abs('aluminum', 'Catalan')
228
        2.467741935483871
229
        >>> cmp.dist_abs('ATCG', 'TAGC')
230
        1.193548387096774
231 1
232
        >>> cmp = PhoneticEditDistance(mode='osa')
233 1
        >>> cmp.dist_abs('ATCG', 'TAGC')
234 1
        0.46236225806451603
235
        >>> cmp.dist_abs('ACTG', 'TAGC')
236 1
        1.2580645161290323
237 1
238 1
239 1
        .. versionadded:: 0.4.1
240 1
241 1
        """
242
        ins_cost, del_cost, sub_cost, trans_cost = self._cost
243 1
244
        src_len = len(src)
245 1
        tar_len = len(tar)
246 1
247
        if src == tar:
248 1
            return 0
249
        if not src:
250 1
            return ins_cost * tar_len
251
        if not tar:
252
            return del_cost * src_len
253
254
        d_mat = cast(
255
            np.ndarray, self._alignment_matrix(src, tar, backtrace=False)
256
        )
257
258
        if int(d_mat[src_len, tar_len]) == d_mat[src_len, tar_len]:
259
            return int(d_mat[src_len, tar_len])
260
        else:
261
            return cast(float, d_mat[src_len, tar_len])
262
263
    def dist(self, src: str, tar: str) -> float:
264
        """Return the normalized phonetic edit distance between two strings.
265
266
        The edit distance is normalized by dividing the edit distance
267
        (calculated by either of the two supported methods) by the
268
        greater of the number of characters in src times the cost of a delete
269
        and the number of characters in tar times the cost of an insert.
270
        For the case in which all operations have :math:`cost = 1`, this is
271
        equivalent to the greater of the length of the two strings src & tar.
272
273
        Parameters
274
        ----------
275
        src : str
276
            Source string for comparison
277
        tar : str
278
            Target string for comparison
279
280
        Returns
281
        -------
282
        float
283
            The normalized Levenshtein distance between src & tar
284
285
        Examples
286
        --------
287
        >>> cmp = PhoneticEditDistance()
288 1
        >>> round(cmp.dist('cat', 'hat'), 12)
289 1
        0.059139784946
290 1
        >>> round(cmp.dist('Niall', 'Neil'), 12)
291
        0.232258064516
292 1
        >>> cmp.dist('aluminum', 'Catalan')
293 1
        0.3084677419354839
294
        >>> cmp.dist('ATCG', 'TAGC')
295 1
        0.2983870967741935
296
297
298
        .. versionadded:: 0.4.1
299 1
300
        """
301
        if src == tar:
302
            return 0.0
303
        ins_cost, del_cost = self._cost[:2]
304
305
        src_len = len(src)
306
        tar_len = len(tar)
307
308
        normalize_term = self._normalizer(
309
            [src_len * del_cost, tar_len * ins_cost]
310
        )
311
312
        return self.dist_abs(src, tar) / normalize_term
313
314
315
if __name__ == '__main__':
316
    import doctest
317
318
    doctest.testmod()
319