# Copyright 2019-2020 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.

"""abydos.distance._meta_levenshtein.

Meta-Levenshtein distance
"""

from collections import defaultdict
from math import log1p
from typing import (
    Any,
    Callable,
    DefaultDict,
    List,
    Optional,
    Tuple,
    cast,
)

from numpy import float_ as np_float
from numpy import zeros as np_zeros

from ._distance import _Distance
from ._jaro_winkler import JaroWinkler
from ..corpus import UnigramCorpus
from ..tokenizer import QGrams, WhitespaceTokenizer, _Tokenizer

__all__ = ['MetaLevenshtein']


class MetaLevenshtein(_Distance):
    r"""Meta-Levenshtein distance.

    Meta-Levenshtein distance :cite:`Moreau:2008` combines Soft-TFIDF with
    Levenshtein alignment.

    .. versionadded:: 0.4.0
    """

    def __init__(
        self,
        tokenizer: Optional[_Tokenizer] = None,
        corpus: Optional[UnigramCorpus] = None,
        metric: Optional[_Distance] = None,
        normalizer: Callable[[List[float]], float] = max,
        **kwargs: Any
    ) -> None:
        """Initialize MetaLevenshtein instance.

        Parameters
        ----------
        tokenizer : _Tokenizer
            A tokenizer instance from the :py:mod:`abydos.tokenizer` package
        corpus : UnigramCorpus
            A unigram corpus :py:class:`UnigramCorpus`. If None, a corpus will
            be created from the two words when a similarity function is called.
        metric : _Distance
            A string distance measure instance for making soft matches,
            Jaro-Winkler by default.
        normalizer : function
            A function that takes a list and computes a normalization term
            by which the edit distance is divided (max by default). Another
            good option is the sum function.
        **kwargs
            Arbitrary keyword arguments

        Other Parameters
        ----------------
        qval : int
            The length of each q-gram. Using this parameter and tokenizer=None
            will cause the instance to use the QGram tokenizer with this
            q value.


        .. versionadded:: 0.4.0

        """
        super(MetaLevenshtein, self).__init__(**kwargs)
        self._corpus = corpus
        self._metric = JaroWinkler() if metric is None else metric
        self._normalizer = normalizer

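        # Tokenizer selection: an explicitly supplied tokenizer is used
        # as-is; otherwise qval == 0 selects whitespace tokenization, and
        # any other q falls back to q-gram tokenization (q=2 by default)
        # with '$'/'#' start/stop padding.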
        qval = 2 if 'qval' not in self.params else self.params['qval']
        self.params['tokenizer'] = (
            tokenizer
            if tokenizer is not None
            else WhitespaceTokenizer()
            if qval == 0
            else QGrams(qval=qval, start_stop='$#', skip=0, scaler=None)
        )

    def dist_abs(self, src: str, tar: str) -> float:
        """Return the Meta-Levenshtein distance of two strings.

        Parameters
        ----------
        src : str
            Source string for comparison
        tar : str
            Target string for comparison

        Returns
        -------
        float
            Meta-Levenshtein distance

        Examples
        --------
        >>> cmp = MetaLevenshtein()
        >>> cmp.dist_abs('cat', 'hat')
        0.6155602628882225
        >>> cmp.dist_abs('Niall', 'Neil')
        2.538900657220556
        >>> cmp.dist_abs('aluminum', 'Catalan')
        6.940747163450747
        >>> cmp.dist_abs('ATCG', 'TAGC')
        3.2311205257764453


        .. versionadded:: 0.4.0

        """
        if src == tar:
            return 0.0
        if not src:
            return float(len(tar))
        if not tar:
            return float(len(src))

        src_tok = self.params['tokenizer'].tokenize(src)
        src_ordered = src_tok.get_list()
        src_tok = src_tok.get_counter()

        tar_tok = self.params['tokenizer'].tokenize(tar)
        tar_ordered = tar_tok.get_list()
        tar_tok = tar_tok.get_counter()

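        # If no corpus was supplied at construction, build a two-document
        # corpus from just src and tar so that idf() is still well-defined.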
        if self._corpus is None:
            corpus = UnigramCorpus(word_tokenizer=self.params['tokenizer'])
            corpus.add_document(src)
            corpus.add_document(tar)
        else:
            corpus = self._corpus

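        # Precompute the pairwise soft-match distances between every source
        # token and every target token, using the supplied metric
        # (Jaro-Winkler by default); identical tokens cost 0.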
        dists = defaultdict(float)  # type: DefaultDict[Tuple[str, str], float]
        s_toks = set(src_tok.keys())
        t_toks = set(tar_tok.keys())
        for s_tok in s_toks:
            for t_tok in t_toks:
                dists[(s_tok, t_tok)] = (
                    self._metric.dist(s_tok, t_tok) if s_tok != t_tok else 0
                )

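        # TF-IDF weights per token: log(1 + term frequency) scaled by the
        # token's inverse document frequency in the corpus.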
        vws_dict = {}
        vwt_dict = {}
        for token in src_tok.keys():
            vws_dict[token] = log1p(src_tok[token]) * corpus.idf(token)
        for token in tar_tok.keys():
            vwt_dict[token] = log1p(tar_tok[token]) * corpus.idf(token)

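        # Soft substitution cost: the metric distance between two tokens,
        # weighted by both tokens' TF-IDF weights.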
        def _dist(s_tok: str, t_tok: str) -> float:
            return dists[(s_tok, t_tok)] * vws_dict[s_tok] * vwt_dict[t_tok]

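        # Standard Levenshtein dynamic-programming table over the ordered
        # token lists; the first row and column accumulate unit
        # insertion/deletion costs.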
        d_mat = np_zeros(
            (len(src_ordered) + 1, len(tar_ordered) + 1), dtype=np_float
        )
        for i in range(len(src_ordered) + 1):
            d_mat[i, 0] = i
        for j in range(len(tar_ordered) + 1):
            d_mat[0, j] = j

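        # Fill the table as in ordinary Levenshtein, except that the
        # substitution cost is the soft TF-IDF-weighted token distance
        # rather than a fixed unit cost.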
        for i in range(len(src_ordered)):
            for j in range(len(tar_ordered)):
                d_mat[i + 1, j + 1] = min(
                    d_mat[i + 1, j] + 1,  # ins
                    d_mat[i, j + 1] + 1,  # del
                    d_mat[i, j]
                    + _dist(src_ordered[i], tar_ordered[j]),  # sub/==
                )

        return cast(float, d_mat[len(src_ordered), len(tar_ordered)])

    def dist(self, src: str, tar: str) -> float:
        """Return the normalized Meta-Levenshtein distance of two strings.

        The Meta-Levenshtein distance is normalized by dividing the absolute
        distance by a normalization term computed with the instance's
        normalizer (max by default): when a corpus was supplied, the
        normalizer is applied to the distances from src to all-space strings
        of the lengths of tar and of src; otherwise it is applied to the
        lengths of the two strings.

        Parameters
        ----------
        src : str
            Source string for comparison
        tar : str
            Target string for comparison

        Returns
        -------
        float
            The normalized Meta-Levenshtein distance between src & tar

        Examples
        --------
        >>> cmp = MetaLevenshtein()
        >>> round(cmp.dist('cat', 'hat'), 12)
        0.205186754296
        >>> round(cmp.dist('Niall', 'Neil'), 12)
        0.507780131444
        >>> cmp.dist('aluminum', 'Catalan')
        0.8675933954313434
        >>> cmp.dist('ATCG', 'TAGC')
        0.8077801314441113


        .. versionadded:: 0.4.0

        """
        if src == tar:
            return 0.0

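        # Normalize by a worst-case term: with a corpus, the distances from
        # src to all-space strings of each operand's length; without one,
        # simply the two string lengths.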
        return self.dist_abs(src, tar) / (
            self._normalizer(
                [
                    self.dist_abs(src, ' ' * len(tar)),
                    self.dist_abs(src, ' ' * len(src)),
                ]
            )
            if self._corpus
            else self._normalizer([len(src), len(tar)])
        )


if __name__ == '__main__':
    import doctest

    doctest.testmod()
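
# A minimal usage sketch (not part of the doctests above): supply your own
# UnigramCorpus so that idf() reflects real token frequencies. The document
# strings here are illustrative placeholders.
#
#     corpus = UnigramCorpus(word_tokenizer=QGrams(qval=2))
#     corpus.add_document('first sample document')
#     corpus.add_document('second sample document')
#     cmp = MetaLevenshtein(corpus=corpus)
#     cmp.dist('cat', 'hat')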