abydos.distance._sift4.sift4_simplest() - Code Metrics - Inspection of "78a222a9f7d8976f6744d263e3d6d01a2a991c27" - chrislit/abydos - Measure and Improve Code Quality continuously with Scrutinizer

Completed

Branch — master (78a222)

by Chris

created 2018-10-26 11:30 UTC

abydos.distance._sift4.sift4_simplest() F

↳ Parent: abydos.distance._sift4

Complexity

Conditions

Size

Total Lines	64
Code Lines	35

Duplication

Lines	0
Ratio	0 %

Code Coverage

Tests	33
CRAP Score	14

Importance

Changes

Metric	Value
eloc	35
dl	0
loc	64
ccs	33
cts	33
cp	1
rs	3.6
c	0
b	0
f	0
cc	14
nop	3
crap	14

How to fix Long Method Complexity

# -*- coding: utf-8 -*-

# Copyright 2018 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.

"""abydos.distance.sift4.

The distance.sift4 module implements Sift4 approximate string distance
functions.
"""

from __future__ import division, unicode_literals

from six.moves import range


__all__ = ['dist_sift4', 'sift4_common', 'sift4_simplest', 'sim_sift4']


def sift4_simplest(src, tar, max_offset=5):
    """Return the "simplest" Sift4 distance between two terms.

    This is an approximation of edit distance, described in
    :cite:`Zackwehdex:2014`.

    :param str src: source string for comparison
    :param str tar: target string for comparison
    :param max_offset: the number of characters to search for matching letters
    :returns: the Sift4 distance according to the simplest formula
    :rtype: int

    >>> sift4_simplest('cat', 'hat')
    1
    >>> sift4_simplest('Niall', 'Neil')
    2
    >>> sift4_simplest('Colin', 'Cuilen')
    3
    >>> sift4_simplest('ATCG', 'TAGC')
    2
    """
    if not src:
        return len(tar)

    if not tar:
        return len(src)

    src_len = len(src)
    tar_len = len(tar)

    src_cur = 0
    tar_cur = 0
    lcss = 0
    local_cs = 0

    while (src_cur < src_len) and (tar_cur < tar_len):
        if src[src_cur] == tar[tar_cur]:
            local_cs += 1
        else:
            lcss += local_cs
            local_cs = 0
            if src_cur != tar_cur:
                src_cur = tar_cur = max(src_cur, tar_cur)
            for i in range(max_offset):
                if not ((src_cur + i < src_len) or (tar_cur + i < tar_len)):
                    break
                if (src_cur + i < src_len) and (
                    src[src_cur + i] == tar[tar_cur]

                ):
                    src_cur += i
                    local_cs += 1
                    break
                if (tar_cur + i < tar_len) and (
                    src[src_cur] == tar[tar_cur + i]

                ):
                    tar_cur += i
                    local_cs += 1
                    break

        src_cur += 1
        tar_cur += 1

    lcss += local_cs
    return round(max(src_len, tar_len) - lcss)


def sift4_common(src, tar, max_offset=5, max_distance=0):

    """Return the "common" Sift4 distance between two terms.

    This is an approximation of edit distance, described in
    :cite:`Zackwehdex:2014`.

    :param str src: source string for comparison
    :param str tar: target string for comparison
    :param max_offset: the number of characters to search for matching letters
    :param max_distance: the distance at which to stop and exit
    :returns: the Sift4 distance according to the common formula
    :rtype: int

    >>> sift4_common('cat', 'hat')
    1
    >>> sift4_common('Niall', 'Neil')
    2
    >>> sift4_common('Colin', 'Cuilen')
    3
    >>> sift4_common('ATCG', 'TAGC')
    2
    """
    if not src:
        return len(tar)

    if not tar:
        return len(src)

    src_len = len(src)
    tar_len = len(tar)

    src_cur = 0
    tar_cur = 0
    lcss = 0
    local_cs = 0
    trans = 0
    offset_arr = []

    while (src_cur < src_len) and (tar_cur < tar_len):
        if src[src_cur] == tar[tar_cur]:
            local_cs += 1
            is_trans = False
            i = 0
            while i < len(offset_arr):
                ofs = offset_arr[i]
                if src_cur <= ofs['src_cur'] or tar_cur <= ofs['tar_cur']:
                    is_trans = abs(tar_cur - src_cur) >= abs(
                        ofs['tar_cur'] - ofs['src_cur']
                    )
                    if is_trans:
                        trans += 1
                    elif not ofs['trans']:
                        ofs['trans'] = True
                        trans += 1
                    break
                elif src_cur > ofs['tar_cur'] and tar_cur > ofs['src_cur']:
                    del offset_arr[i]
                else:
                    i += 1

            offset_arr.append(
                {'src_cur': src_cur, 'tar_cur': tar_cur, 'trans': is_trans}
            )
        else:
            lcss += local_cs
            local_cs = 0
            if src_cur != tar_cur:
                src_cur = tar_cur = min(src_cur, tar_cur)
            for i in range(max_offset):
                if not ((src_cur + i < src_len) or (tar_cur + i < tar_len)):
                    break
                if (src_cur + i < src_len) and (
                    src[src_cur + i] == tar[tar_cur]

                ):
                    src_cur += i - 1
                    tar_cur -= 1
                    break
                if (tar_cur + i < tar_len) and (
                    src[src_cur] == tar[tar_cur + i]

                ):
                    src_cur -= 1
                    tar_cur += i - 1
                    break

        src_cur += 1
        tar_cur += 1

        if max_distance:
            temporary_distance = max(src_cur, tar_cur) - lcss + trans
            if temporary_distance >= max_distance:
                return round(temporary_distance)

        if (src_cur >= src_len) or (tar_cur >= tar_len):
            lcss += local_cs
            local_cs = 0
            src_cur = tar_cur = min(src_cur, tar_cur)

    lcss += local_cs
    return round(max(src_len, tar_len) - lcss + trans)


def dist_sift4(src, tar, max_offset=5, max_distance=0):
    """Return the normalized "common" Sift4 distance between two terms.

    This is Sift4 distance, normalized to [0, 1].

    :param str src: source string for comparison
    :param str tar: target string for comparison
    :param max_offset: the number of characters to search for matching letters
    :param max_distance: the distance at which to stop and exit
    :returns: the normalized Sift4 distance
    :rtype: float

    >>> round(dist_sift4('cat', 'hat'), 12)
    0.333333333333
    >>> dist_sift4('Niall', 'Neil')
    0.4
    >>> dist_sift4('Colin', 'Cuilen')
    0.5
    >>> dist_sift4('ATCG', 'TAGC')
    0.5
    """
    return sift4_common(src, tar, max_offset, max_distance) / (
        max(len(src), len(tar), 1)
    )


def sim_sift4(src, tar, max_offset=5, max_distance=0):
    """Return the normalized "common" Sift4 similarity of two terms.

    Normalized Sift4 similarity is the complement of normalized Sift4 distance:
    :math:`sim_{Sift4} = 1 - dist_{Sift4}`.

    :param str src: source string for comparison
    :param str tar: target string for comparison
    :param max_offset: the number of characters to search for matching letters
    :param max_distance: the distance at which to stop and exit
    :returns: the normalized Sift4 similarity
    :rtype: float

    >>> round(sim_sift4('cat', 'hat'), 12)
    0.666666666667
    >>> sim_sift4('Niall', 'Neil')
    0.6
    >>> sim_sift4('Colin', 'Cuilen')
    0.5
    >>> sim_sift4('ATCG', 'TAGC')
    0.5
    """
    return 1 - dist_sift4(src, tar, max_offset, max_distance)


if __name__ == '__main__':
    import doctest

    doctest.testmod()


1		# -- coding: utf-8 --
2
3		# Copyright 2018 by Christopher C. Little.
4		# This file is part of Abydos.
5		#
6		# Abydos is free software: you can redistribute it and/or modify
7		# it under the terms of the GNU General Public License as published by
8		# the Free Software Foundation, either version 3 of the License, or
9		# (at your option) any later version.
10		#
11		# Abydos is distributed in the hope that it will be useful,
12		# but WITHOUT ANY WARRANTY; without even the implied warranty of
13		# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14		# GNU General Public License for more details.
15		#
16		# You should have received a copy of the GNU General Public License
17		# along with Abydos. If not, see <http://www.gnu.org/licenses/>.
18
19	1	"""abydos.distance.sift4.
20
21		The distance.sift4 module implements Sift4 approximate string distance
22		functions.
23		"""
24
25	1	from __future__ import division, unicode_literals
26
27	1	from six.moves import range
28
29
30	1	__all__ = ['dist_sift4', 'sift4_common', 'sift4_simplest', 'sim_sift4']
31
32
33	1	def sift4_simplest(src, tar, max_offset=5):
34		"""Return the "simplest" Sift4 distance between two terms.
35
36		This is an approximation of edit distance, described in
37		:cite:`Zackwehdex:2014`.
38
39		:param str src: source string for comparison
40		:param str tar: target string for comparison
41		:param max_offset: the number of characters to search for matching letters
42		:returns: the Sift4 distance according to the simplest formula
43		:rtype: int
44
45		>>> sift4_simplest('cat', 'hat')
46		1
47		>>> sift4_simplest('Niall', 'Neil')
48		2
49		>>> sift4_simplest('Colin', 'Cuilen')
50		3
51		>>> sift4_simplest('ATCG', 'TAGC')
52		2
53		"""
54	1	if not src:
55	1	return len(tar)
56
57	1	if not tar:
58	1	return len(src)
59
60	1	src_len = len(src)
61	1	tar_len = len(tar)
62
63	1	src_cur = 0
64	1	tar_cur = 0
65	1	lcss = 0
66	1	local_cs = 0
67
68	1	while (src_cur < src_len) and (tar_cur < tar_len):
69	1	if src[src_cur] == tar[tar_cur]:
70	1	local_cs += 1
71		else:
72	1	lcss += local_cs
73	1	local_cs = 0
74	1	if src_cur != tar_cur:
75	1	src_cur = tar_cur = max(src_cur, tar_cur)
76	1	for i in range(max_offset):
77	1	if not ((src_cur + i < src_len) or (tar_cur + i < tar_len)):
78	1	break
79	1	if (src_cur + i < src_len) and (
80		src[src_cur + i] == tar[tar_cur]
		0 ignored issues – show Coding Style introduced 2018-10-24 06:00 UTC by Report Bug Copy Issue Report Wrong hanging indentation before block (add 4 spaces). Loading history...
81		):
82	1	src_cur += i
83	1	local_cs += 1
84	1	break
85	1	if (tar_cur + i < tar_len) and (
86		src[src_cur] == tar[tar_cur + i]
		0 ignored issues – show Coding Style introduced 2018-10-24 06:00 UTC by Report Bug Copy Issue Report Wrong hanging indentation before block (add 4 spaces). Loading history...
87		):
88	1	tar_cur += i
89	1	local_cs += 1
90	1	break
91
92	1	src_cur += 1
93	1	tar_cur += 1
94
95	1	lcss += local_cs
96	1	return round(max(src_len, tar_len) - lcss)
97
98
99	1	def sift4_common(src, tar, max_offset=5, max_distance=0):
		0 ignored issues – show Comprehensibility introduced 2018-08-15 22:53 UTC by Report Bug Copy Issue Report This function exceeds the maximum number of variables (16/15). Loading history...
100		"""Return the "common" Sift4 distance between two terms.
101
102		This is an approximation of edit distance, described in
103		:cite:`Zackwehdex:2014`.
104
105		:param str src: source string for comparison
106		:param str tar: target string for comparison
107		:param max_offset: the number of characters to search for matching letters
108		:param max_distance: the distance at which to stop and exit
109		:returns: the Sift4 distance according to the common formula
110		:rtype: int
111
112		>>> sift4_common('cat', 'hat')
113		1
114		>>> sift4_common('Niall', 'Neil')
115		2
116		>>> sift4_common('Colin', 'Cuilen')
117		3
118		>>> sift4_common('ATCG', 'TAGC')
119		2
120		"""
121	1	if not src:
122	1	return len(tar)
123
124	1	if not tar:
125	1	return len(src)
126
127	1	src_len = len(src)
128	1	tar_len = len(tar)
129
130	1	src_cur = 0
131	1	tar_cur = 0
132	1	lcss = 0
133	1	local_cs = 0
134	1	trans = 0
135	1	offset_arr = []
136
137	1	while (src_cur < src_len) and (tar_cur < tar_len):
138	1	if src[src_cur] == tar[tar_cur]:
139	1	local_cs += 1
140	1	is_trans = False
141	1	i = 0
142	1	while i < len(offset_arr):
143	1	ofs = offset_arr[i]
144	1	if src_cur <= ofs['src_cur'] or tar_cur <= ofs['tar_cur']:
145	1	is_trans = abs(tar_cur - src_cur) >= abs(
146		ofs['tar_cur'] - ofs['src_cur']
147		)
148	1	if is_trans:
149	1	trans += 1
150	1	elif not ofs['trans']:
151	1	ofs['trans'] = True
152	1	trans += 1
153	1	break
154	1	elif src_cur > ofs['tar_cur'] and tar_cur > ofs['src_cur']:
155	1	del offset_arr[i]
156		else:
157	1	i += 1
158
159	1	offset_arr.append(
160		{'src_cur': src_cur, 'tar_cur': tar_cur, 'trans': is_trans}
161		)
162		else:
163	1	lcss += local_cs
164	1	local_cs = 0
165	1	if src_cur != tar_cur:
166	1	src_cur = tar_cur = min(src_cur, tar_cur)
167	1	for i in range(max_offset):
168	1	if not ((src_cur + i < src_len) or (tar_cur + i < tar_len)):
169	1	break
170	1	if (src_cur + i < src_len) and (
171		src[src_cur + i] == tar[tar_cur]
		0 ignored issues – show Coding Style introduced 2018-10-24 06:00 UTC by Report Bug Copy Issue Report Wrong hanging indentation before block (add 4 spaces). Loading history...
172		):
173	1	src_cur += i - 1
174	1	tar_cur -= 1
175	1	break
176	1	if (tar_cur + i < tar_len) and (
177		src[src_cur] == tar[tar_cur + i]
		0 ignored issues – show Coding Style introduced 2018-10-24 06:00 UTC by Report Bug Copy Issue Report Wrong hanging indentation before block (add 4 spaces). Loading history...
178		):
179	1	src_cur -= 1
180	1	tar_cur += i - 1
181	1	break
182
183	1	src_cur += 1
184	1	tar_cur += 1
185
186	1	if max_distance:
187	1	temporary_distance = max(src_cur, tar_cur) - lcss + trans
188	1	if temporary_distance >= max_distance:
189	1	return round(temporary_distance)
190
191	1	if (src_cur >= src_len) or (tar_cur >= tar_len):
192	1	lcss += local_cs
193	1	local_cs = 0
194	1	src_cur = tar_cur = min(src_cur, tar_cur)
195
196	1	lcss += local_cs
197	1	return round(max(src_len, tar_len) - lcss + trans)
198
199
200	1	def dist_sift4(src, tar, max_offset=5, max_distance=0):
201		"""Return the normalized "common" Sift4 distance between two terms.
202
203		This is Sift4 distance, normalized to [0, 1].
204
205		:param str src: source string for comparison
206		:param str tar: target string for comparison
207		:param max_offset: the number of characters to search for matching letters
208		:param max_distance: the distance at which to stop and exit
209		:returns: the normalized Sift4 distance
210		:rtype: float
211
212		>>> round(dist_sift4('cat', 'hat'), 12)
213		0.333333333333
214		>>> dist_sift4('Niall', 'Neil')
215		0.4
216		>>> dist_sift4('Colin', 'Cuilen')
217		0.5
218		>>> dist_sift4('ATCG', 'TAGC')
219		0.5
220		"""
221	1	return sift4_common(src, tar, max_offset, max_distance) / (
222		max(len(src), len(tar), 1)
223		)
224
225
226	1	def sim_sift4(src, tar, max_offset=5, max_distance=0):
227		"""Return the normalized "common" Sift4 similarity of two terms.
228
229		Normalized Sift4 similarity is the complement of normalized Sift4 distance:
230		:math:`sim_{Sift4} = 1 - dist_{Sift4}`.
231
232		:param str src: source string for comparison
233		:param str tar: target string for comparison
234		:param max_offset: the number of characters to search for matching letters
235		:param max_distance: the distance at which to stop and exit
236		:returns: the normalized Sift4 similarity
237		:rtype: float
238
239		>>> round(sim_sift4('cat', 'hat'), 12)
240		0.666666666667
241		>>> sim_sift4('Niall', 'Neil')
242		0.6
243		>>> sim_sift4('Colin', 'Cuilen')
244		0.5
245		>>> sim_sift4('ATCG', 'TAGC')
246		0.5
247		"""
248	1	return 1 - dist_sift4(src, tar, max_offset, max_distance)
249
250
251		if __name__ == '__main__':
252		import doctest
253
254		doctest.testmod()
255

chrislit / abydos

Branch — master (78a222)

abydos.distance._sift4.sift4_simplest() F

Complexity

Size

Duplication

Code Coverage

Importance

How to fix Long Method Complexity

Long Method

Complexity

Duplication Side-by-Side

Filter issues like