abydos.distance._sift4.Sift4.dist_abs() - Code Metrics - chrislit/abydos - Measure and Improve Code Quality continuously with Scrutinizer

abydos.distance._sift4.Sift4.dist_abs() F
last analyzed 2020-12-31 20:10 UTC

↳ Parent: abydos.distance._sift4

Complexity

Conditions

Size

Total Lines	112
Code Lines	64

Duplication

Lines	0
Ratio	0 %

Code Coverage

Tests	54
CRAP Score	25

Importance

Changes

Metric	Value
eloc	64
dl	0
loc	112
ccs	54
cts	54
cp	1
rs	0
c	0
b	0
f	0
cc	25
nop	3
crap	25

How to fix Long Method Complexity

# Copyright 2018-2020 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.

"""abydos.distance._sift4.

Sift4 Common approximate string distance
"""

from typing import Any, Dict, List, Union

from ._distance import _Distance

__all__ = ['Sift4']


class Sift4(_Distance):
    """Sift4 Common version.

    This is an approximation of edit distance, described in
    :cite:`Zackwehdex:2014`.

    .. versionadded:: 0.3.6
    """

    def __init__(
        self, max_offset: int = 5, max_distance: int = 0, **kwargs: Any
    ) -> None:
        """Initialize Sift4 instance.

        Parameters
        ----------
        max_offset : int
            The number of characters to search for matching letters
        max_distance : int
            The distance at which to stop and exit
        **kwargs
            Arbitrary keyword arguments


        .. versionadded:: 0.4.0

        """
        super(Sift4, self).__init__(**kwargs)
        self._max_offset = max_offset
        self._max_distance = max_distance

    def dist_abs(self, src: str, tar: str) -> float:
        """Return the "common" Sift4 distance between two terms.

        Parameters
        ----------
        src : str
            Source string for comparison
        tar : str
            Target string for comparison

        Returns
        -------
        int
            The Sift4 distance according to the common formula

        Examples
        --------
        >>> cmp = Sift4()
        >>> cmp.dist_abs('cat', 'hat')
        1
        >>> cmp.dist_abs('Niall', 'Neil')
        2
        >>> cmp.dist_abs('Colin', 'Cuilen')
        3
        >>> cmp.dist_abs('ATCG', 'TAGC')
        2


        .. versionadded:: 0.3.0
        .. versionchanged:: 0.3.6
            Encapsulated in class

        """
        if not src:
            return len(tar)

        if not tar:
            return len(src)

        src_len = len(src)
        tar_len = len(tar)

        src_cur = 0
        tar_cur = 0
        lcss = 0
        local_cs = 0
        trans = 0
        offset_arr = []  # type: List[Dict[str, Union[int, bool]]]

        while (src_cur < src_len) and (tar_cur < tar_len):
            if src[src_cur] == tar[tar_cur]:
                local_cs += 1
                is_trans = False
                i = 0
                while i < len(offset_arr):
                    ofs = offset_arr[i]
                    if src_cur <= ofs['src_cur'] or tar_cur <= ofs['tar_cur']:
                        is_trans = abs(tar_cur - src_cur) >= abs(
                            ofs['tar_cur'] - ofs['src_cur']
                        )
                        if is_trans:
                            trans += 1
                        elif not ofs['trans']:
                            ofs['trans'] = True
                            trans += 1
                        break
                    elif src_cur > ofs['tar_cur'] and tar_cur > ofs['src_cur']:
                        del offset_arr[i]
                    else:
                        i += 1

                offset_arr.append(
                    {'src_cur': src_cur, 'tar_cur': tar_cur, 'trans': is_trans}
                )
            else:
                lcss += local_cs
                local_cs = 0
                if src_cur != tar_cur:
                    src_cur = tar_cur = min(src_cur, tar_cur)
                for i in range(self._max_offset):
                    if not (
                        (src_cur + i < src_len) or (tar_cur + i < tar_len)
                    ):
                        break
                    if (src_cur + i < src_len) and (
                        src[src_cur + i] == tar[tar_cur]
                    ):
                        src_cur += i - 1
                        tar_cur -= 1
                        break
                    if (tar_cur + i < tar_len) and (
                        src[src_cur] == tar[tar_cur + i]
                    ):
                        src_cur -= 1
                        tar_cur += i - 1
                        break

            src_cur += 1
            tar_cur += 1

            if self._max_distance:
                temporary_distance = max(src_cur, tar_cur) - lcss + trans
                if temporary_distance >= self._max_distance:
                    return round(temporary_distance)

            if (src_cur >= src_len) or (tar_cur >= tar_len):
                lcss += local_cs
                local_cs = 0
                src_cur = tar_cur = min(src_cur, tar_cur)

        lcss += local_cs
        return round(max(src_len, tar_len) - lcss + trans)

    def dist(self, src: str, tar: str) -> float:
        """Return the normalized "common" Sift4 distance between two terms.

        This is Sift4 distance, normalized to [0, 1].

        Parameters
        ----------
        src : str
            Source string for comparison
        tar : str
            Target string for comparison

        Returns
        -------
        float
            The normalized Sift4 distance

        Examples
        --------
        >>> cmp = Sift4()
        >>> round(cmp.dist('cat', 'hat'), 12)
        0.333333333333
        >>> cmp.dist('Niall', 'Neil')
        0.4
        >>> cmp.dist('Colin', 'Cuilen')
        0.5
        >>> cmp.dist('ATCG', 'TAGC')
        0.5


        .. versionadded:: 0.3.0
        .. versionchanged:: 0.3.6
            Encapsulated in class

        """
        return self.dist_abs(src, tar) / (max(len(src), len(tar), 1))


if __name__ == '__main__':
    import doctest

    doctest.testmod()


1		# Copyright 2018-2020 by Christopher C. Little.
2		# This file is part of Abydos.
3		#
4		# Abydos is free software: you can redistribute it and/or modify
5		# it under the terms of the GNU General Public License as published by
6		# the Free Software Foundation, either version 3 of the License, or
7		# (at your option) any later version.
8		#
9		# Abydos is distributed in the hope that it will be useful,
10		# but WITHOUT ANY WARRANTY; without even the implied warranty of
11		# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12		# GNU General Public License for more details.
13		#
14		# You should have received a copy of the GNU General Public License
15		# along with Abydos. If not, see <http://www.gnu.org/licenses/>.
16
17		"""abydos.distance._sift4.
18
19	1	Sift4 Common approximate string distance
20		"""
21
22		from typing import Any, Dict, List, Union
23
24	1	from ._distance import _Distance
25
26		__all__ = ['Sift4']
27
28
29		class Sift4(_Distance):
30		"""Sift4 Common version.
31	1
32		This is an approximation of edit distance, described in
33	1	:cite:`Zackwehdex:2014`.
34
35	1	.. versionadded:: 0.3.6
36	1	"""
37
38	1	def __init__(
39		self, max_offset: int = 5, max_distance: int = 0, **kwargs: Any
40		) -> None:
41	1	"""Initialize Sift4 instance.
42
43		Parameters
44		----------
45		max_offset : int
46		The number of characters to search for matching letters
47		max_distance : int
48		The distance at which to stop and exit
49		**kwargs
50	1	Arbitrary keyword arguments
51
52
53		.. versionadded:: 0.4.0
54
55		"""
56		super(Sift4, self).__init__(**kwargs)
57		self._max_offset = max_offset
58		self._max_distance = max_distance
59
60		def dist_abs(self, src: str, tar: str) -> float:
61		"""Return the "common" Sift4 distance between two terms.
62
63		Parameters
64		----------
65		src : str
66	1	Source string for comparison
67	1	tar : str
68	1	Target string for comparison
69
70	1	Returns
71		-------
72		int
73		The Sift4 distance according to the common formula
74
75		Examples
76		--------
77		>>> cmp = Sift4()
78		>>> cmp.dist_abs('cat', 'hat')
79		1
80		>>> cmp.dist_abs('Niall', 'Neil')
81		2
82		>>> cmp.dist_abs('Colin', 'Cuilen')
83		3
84		>>> cmp.dist_abs('ATCG', 'TAGC')
85		2
86
87
88		.. versionadded:: 0.3.0
89		.. versionchanged:: 0.3.6
90		Encapsulated in class
91
92		"""
93		if not src:
94		return len(tar)
95
96		if not tar:
97		return len(src)
98
99		src_len = len(src)
100		tar_len = len(tar)
101
102		src_cur = 0
103	1	tar_cur = 0
104	1	lcss = 0
105		local_cs = 0
106	1	trans = 0
107	1	offset_arr = [] # type: List[Dict[str, Union[int, bool]]]
108
109	1	while (src_cur < src_len) and (tar_cur < tar_len):
110	1	if src[src_cur] == tar[tar_cur]:
111		local_cs += 1
112	1	is_trans = False
113	1	i = 0
114	1	while i < len(offset_arr):
115	1	ofs = offset_arr[i]
116	1	if src_cur <= ofs['src_cur'] or tar_cur <= ofs['tar_cur']:
117	1	is_trans = abs(tar_cur - src_cur) >= abs(
118		ofs['tar_cur'] - ofs['src_cur']
119	1	)
120	1	if is_trans:
121	1	trans += 1
122	1	elif not ofs['trans']:
123	1	ofs['trans'] = True
124	1	trans += 1
125	1	break
126	1	elif src_cur > ofs['tar_cur'] and tar_cur > ofs['src_cur']:
127	1	del offset_arr[i]
128		else:
129		i += 1
130	1
131	1	offset_arr.append(
132	1	{'src_cur': src_cur, 'tar_cur': tar_cur, 'trans': is_trans}
133	1	)
134	1	else:
135	1	lcss += local_cs
136	1	local_cs = 0
137	1	if src_cur != tar_cur:
138		src_cur = tar_cur = min(src_cur, tar_cur)
139	1	for i in range(self._max_offset):
140		if not (
141	1	(src_cur + i < src_len) or (tar_cur + i < tar_len)
142		):
143		break
144		if (src_cur + i < src_len) and (
145	1	src[src_cur + i] == tar[tar_cur]
146	1	):
147	1	src_cur += i - 1
148	1	tar_cur -= 1
149	1	break
150	1	if (tar_cur + i < tar_len) and (
151		src[src_cur] == tar[tar_cur + i]
152		):
153	1	src_cur -= 1
154	1	tar_cur += i - 1
155		break
156
157	1	src_cur += 1
158	1	tar_cur += 1
159	1
160	1	if self._max_distance:
161		temporary_distance = max(src_cur, tar_cur) - lcss + trans
162		if temporary_distance >= self._max_distance:
163	1	return round(temporary_distance)
164	1
165	1	if (src_cur >= src_len) or (tar_cur >= tar_len):
166		lcss += local_cs
167	1	local_cs = 0
168	1	src_cur = tar_cur = min(src_cur, tar_cur)
169
170	1	lcss += local_cs
171	1	return round(max(src_len, tar_len) - lcss + trans)
172	1
173	1	def dist(self, src: str, tar: str) -> float:
174		"""Return the normalized "common" Sift4 distance between two terms.
175	1
176	1	This is Sift4 distance, normalized to [0, 1].
177	1
178	1	Parameters
179		----------
180	1	src : str
181	1	Source string for comparison
182		tar : str
183	1	Target string for comparison
184
185		Returns
186		-------
187		float
188		The normalized Sift4 distance
189
190		Examples
191		--------
192		>>> cmp = Sift4()
193		>>> round(cmp.dist('cat', 'hat'), 12)
194		0.333333333333
195		>>> cmp.dist('Niall', 'Neil')
196		0.4
197		>>> cmp.dist('Colin', 'Cuilen')
198		0.5
199		>>> cmp.dist('ATCG', 'TAGC')
200		0.5
201
202
203		.. versionadded:: 0.3.0
204		.. versionchanged:: 0.3.6
205		Encapsulated in class
206
207		"""
208		return self.dist_abs(src, tar) / (max(len(src), len(tar), 1))
209
210
211		if __name__ == '__main__':
212		import doctest
213
214		doctest.testmod()
215

chrislit / abydos

abydos.distance._sift4.Sift4.dist_abs() F last analyzed 2020-12-31 20:10 UTC

Complexity

Size

Duplication

Code Coverage

Importance

How to fix Long Method Complexity

Long Method

Complexity

Duplication Side-by-Side

Filter issues like

abydos.distance._sift4.Sift4.dist_abs() F
last analyzed 2020-12-31 20:10 UTC