Passed
Push — master ( 416c2f...9ec382 )
by Chris
01:03 queued 13s
created

MetaLevenshtein.__init__()   A

Complexity

Conditions 5

Size

Total Lines 50
Code Lines 18

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 7
CRAP Score 5

Importance

Changes 0
Metric Value
eloc 18
dl 0
loc 50
ccs 7
cts 7
cp 1
rs 9.0333
c 0
b 0
f 0
cc 5
nop 6
crap 5
1
# Copyright 2019-2020 by Christopher C. Little.
2
# This file is part of Abydos.
3
#
4
# Abydos is free software: you can redistribute it and/or modify
5
# it under the terms of the GNU General Public License as published by
6
# the Free Software Foundation, either version 3 of the License, or
7
# (at your option) any later version.
8
#
9
# Abydos is distributed in the hope that it will be useful,
10
# but WITHOUT ANY WARRANTY; without even the implied warranty of
11
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12
# GNU General Public License for more details.
13
#
14
# You should have received a copy of the GNU General Public License
15
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.
16
17
"""abydos.distance._meta_levenshtein.
18
19 1
Meta-Levenshtein distance
20
"""
21
22
from collections import defaultdict
23
from math import log1p
24 1
from typing import (
25
    Any,
26
    Callable,
27
    DefaultDict,
28
    List,
29
    Optional,
30
    Tuple,
31 1
    cast,
32 1
)
33
34 1
from numpy import float_ as np_float
35 1
from numpy import zeros as np_zeros
36
37 1
from ._distance import _Distance
38 1
from ._jaro_winkler import JaroWinkler
39 1
from ..corpus import UnigramCorpus
40 1
from ..tokenizer import QGrams, WhitespaceTokenizer, _Tokenizer
41
42 1
__all__ = ['MetaLevenshtein']
43
44
45 1
class MetaLevenshtein(_Distance):
    r"""Meta-Levenshtein distance.

    Meta-Levenshtein distance :cite:`Moreau:2008` combines Soft-TFIDF with
    Levenshtein alignment: the unit substitution cost of ordinary Levenshtein
    is replaced by a soft token-match distance weighted by corpus TF-IDF
    statistics.

    .. versionadded:: 0.4.0
    """

    def __init__(
        self,
        tokenizer: Optional[_Tokenizer] = None,
        corpus: Optional[UnigramCorpus] = None,
        metric: Optional[_Distance] = None,
        normalizer: Callable[[List[float]], float] = max,
        **kwargs: Any
    ) -> None:
        """Initialize MetaLevenshtein instance.

        Parameters
        ----------
        tokenizer : _Tokenizer
            A tokenizer instance from the :py:mod:`abydos.tokenizer` package
        corpus : UnigramCorpus
            A unigram corpus :py:class:`UnigramCorpus`. If None, a corpus will
            be created from the two words when a similarity function is called.
        metric : _Distance
            A string distance measure class for making soft matches, by default
            Jaro-Winkler.
        normalizer : function
            A function that takes a list and computes a normalization term
            by which the edit distance is divided (max by default). Another
            good option is the sum function.
        **kwargs
            Arbitrary keyword arguments

        Other Parameters
        ----------------
        qval : int
            The length of each q-gram. Using this parameter and tokenizer=None
            will cause the instance to use the QGram tokenizer with this
            q value.


        .. versionadded:: 0.4.0

        """
        super(MetaLevenshtein, self).__init__(**kwargs)
        self._corpus = corpus
        self._metric = JaroWinkler() if metric is None else metric
        self._normalizer = normalizer

        # Default to bigrams; a qval of 0 selects whitespace tokenization
        # instead of q-grams.
        qval = self.params.get('qval', 2)
        self.params['tokenizer'] = (
            tokenizer
            if tokenizer is not None
            else WhitespaceTokenizer()
            if qval == 0
            else QGrams(qval=qval, start_stop='$#', skip=0, scaler=None)
        )

    def dist_abs(self, src: str, tar: str) -> float:
        """Return the Meta-Levenshtein distance of two strings.

        Parameters
        ----------
        src : str
            Source string for comparison
        tar : str
            Target string for comparison

        Returns
        -------
        float
            Meta-Levenshtein distance

        Examples
        --------
        >>> cmp = MetaLevenshtein()
        >>> cmp.dist_abs('cat', 'hat')
        0.6155602628882225
        >>> cmp.dist_abs('Niall', 'Neil')
        2.538900657220556
        >>> cmp.dist_abs('aluminum', 'Catalan')
        6.940747163450747
        >>> cmp.dist_abs('ATCG', 'TAGC')
        3.2311205257764453


        .. versionadded:: 0.4.0

        """
        # Trivial cases: identical strings, or one string empty (distance is
        # then just the other string's length).
        if src == tar:
            return 0.0
        if not src:
            return float(len(tar))
        if not tar:
            return float(len(src))

        # Tokenize both strings, keeping the ordered token list (for the
        # alignment) and the token counter (for the TF weights).
        src_tok = self.params['tokenizer'].tokenize(src)
        src_ordered = src_tok.get_list()
        src_tok = src_tok.get_counter()

        tar_tok = self.params['tokenizer'].tokenize(tar)
        tar_ordered = tar_tok.get_list()
        tar_tok = tar_tok.get_counter()

        # Without a supplied corpus, build an ad hoc two-document corpus so
        # idf() is defined for every token seen here.
        if self._corpus is None:
            corpus = UnigramCorpus(word_tokenizer=self.params['tokenizer'])
            corpus.add_document(src)
            corpus.add_document(tar)
        else:
            corpus = self._corpus

        # Pairwise soft-match distances between distinct tokens (0 for
        # identical tokens).
        dists = defaultdict(float)  # type: DefaultDict[Tuple[str, str], float]
        s_toks = set(src_tok.keys())
        t_toks = set(tar_tok.keys())
        for s_tok in s_toks:
            for t_tok in t_toks:
                dists[(s_tok, t_tok)] = (
                    self._metric.dist(s_tok, t_tok) if s_tok != t_tok else 0
                )

        # TF-IDF weight per token and string: log(1 + tf) * idf.
        vws_dict = {}
        vwt_dict = {}
        for token in src_tok.keys():
            vws_dict[token] = log1p(src_tok[token]) * corpus.idf(token)
        for token in tar_tok.keys():
            vwt_dict[token] = log1p(tar_tok[token]) * corpus.idf(token)

        def _dist(s_tok: str, t_tok: str) -> float:
            """Return the weighted substitution cost for a token pair."""
            return dists[(s_tok, t_tok)] * vws_dict[s_tok] * vwt_dict[t_tok]

        # Standard Levenshtein dynamic program over the ordered token
        # sequences, with the substitution cost replaced by the weighted
        # soft-match cost above.
        d_mat = np_zeros(
            (len(src_ordered) + 1, len(tar_ordered) + 1), dtype=np_float
        )
        for i in range(len(src_ordered) + 1):
            d_mat[i, 0] = i
        for j in range(len(tar_ordered) + 1):
            d_mat[0, j] = j

        for i in range(len(src_ordered)):
            for j in range(len(tar_ordered)):
                d_mat[i + 1, j + 1] = min(
                    d_mat[i + 1, j] + 1,  # ins
                    d_mat[i, j + 1] + 1,  # del
                    d_mat[i, j]
                    + _dist(src_ordered[i], tar_ordered[j]),  # sub/==
                )

        return cast(float, d_mat[len(src_ordered), len(tar_ordered)])

    def dist(self, src: str, tar: str) -> float:
        """Return the normalized Meta-Levenshtein distance of two strings.

        The absolute Meta-Levenshtein distance is divided by a term computed
        by the normalizer function (max by default). When a corpus was
        supplied, that term is computed over reference distances to all-space
        strings of matching length; otherwise it is computed over the two
        string lengths.

        Parameters
        ----------
        src : str
            Source string for comparison
        tar : str
            Target string for comparison

        Returns
        -------
        float
            The normalized Meta-Levenshtein distance between src & tar

        Examples
        --------
        >>> cmp = MetaLevenshtein()
        >>> round(cmp.dist('cat', 'hat'), 12)
        0.205186754296
        >>> round(cmp.dist('Niall', 'Neil'), 12)
        0.507780131444
        >>> cmp.dist('aluminum', 'Catalan')
        0.8675933954313434
        >>> cmp.dist('ATCG', 'TAGC')
        0.8077801314441113


        .. versionadded:: 0.4.0

        """
        if src == tar:
            return 0.0

        # NOTE(review): both reference distances below compare *src* to a
        # blank string; confirm the second was not meant to be
        # dist_abs(tar, ' ' * len(tar)). Also, the truthiness test on
        # self._corpus differs from the `is None` test in dist_abs — verify
        # an empty-but-present corpus is meant to fall through to the
        # length-based normalizer here.
        return self.dist_abs(src, tar) / (
            self._normalizer(
                [
                    self.dist_abs(src, ' ' * len(tar)),
                    self.dist_abs(src, ' ' * len(src)),
                ]
            )
            if self._corpus
            else self._normalizer([len(src), len(tar)])
        )
250
251
252
if __name__ == '__main__':
    # Run this module's embedded doctests when executed directly.
    import doctest

    doctest.testmod()