abydos.distance._baystat.sim_baystat() - Code Metrics - Inspection of "78a222a9f7d8976f6744d263e3d6d01a2a991c27" - chrislit/abydos - Measure and Improve Code Quality continuously with Scrutinizer

Completed

Branch — master (78a222)

by Chris

created 2018-10-26 11:30 UTC

abydos.distance._baystat.sim_baystat() F

↳ Parent: abydos.distance._baystat

Complexity

Conditions

Size

Total Lines	114
Code Lines	53

Duplication

Lines	0
Ratio	0 %

Code Coverage

Tests	47
CRAP Score	18

Importance

Changes

Metric	Value
eloc	53
dl	0
loc	114
ccs	47
cts	47
cp	1
rs	1.2
c	0
b	0
f	0
cc	18
nop	5
crap	18

How to fix Long Method Complexity

# -*- coding: utf-8 -*-

# Copyright 2018 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.

"""abydos.distance.baystat.

The distance.baystat module implements BayStat similarity.
"""

from __future__ import division, unicode_literals

__all__ = ['dist_baystat', 'sim_baystat']


def sim_baystat(src, tar, min_ss_len=None, left_ext=None, right_ext=None):
    """Return the Baystat similarity.

    Good results for shorter words are reported when setting min_ss_len to 1
    and either left_ext OR right_ext to 1.

    The Baystat similarity is defined in :cite:`Furnohr:2002`.

    This is ostensibly a port of the R module PPRL's implementation:
    https://github.com/cran/PPRL/blob/master/src/MTB_Baystat.cpp
    :cite:`Rukasz:2018`. As such, this could be made more pythonic.

    Each of the three tuning parameters is defaulted independently: a
    parameter that is left unset takes the article's value for the word
    length at hand (2 when the longer word has 7 or more characters,
    otherwise 1 for min_ss_len and 0 for the extensions). An explicit 0 for
    left_ext or right_ext is honored rather than treated as "unset".

    :param str src: source string for comparison
    :param str tar: target string for comparison
    :param int min_ss_len: minimum substring length to be considered (None
        or 0 selects the default)
    :param int left_ext: left-side extension length (None selects the
        default)
    :param int right_ext: right-side extension length (None selects the
        default)
    :returns: the Baystat similarity
    :rtype: float

    >>> round(sim_baystat('cat', 'hat'), 12)
    0.666666666667
    >>> sim_baystat('Niall', 'Neil')
    0.4
    >>> round(sim_baystat('Colin', 'Cuilen'), 12)
    0.166666666667
    >>> sim_baystat('ATCG', 'TAGC')
    0.0
    >>> sim_baystat('ab', 'ba', 1, 1, 0)
    0.5
    """
    if src == tar:
        return 1.0
    if not src or not tar:
        return 0.0

    src_len = len(src)
    # Matched characters of tar are overwritten with same-length runs of
    # '#', so the length of tar never changes inside the loop below.
    tar_len = len(tar)
    max_len = max(src_len, tar_len)

    # Defaults are based on values from the article. The paper suggests that
    # for short names, (exclusively) one or the other of left_ext and
    # right_ext can be 1, with good results; 0 & 0 is used as the default in
    # that case.
    if max_len >= 7:
        default_ss_len, default_ext = 2, 2
    else:
        default_ss_len, default_ext = 1, 0

    # A zero-length minimum substring is meaningless, so 0 falls back to the
    # default just like None. The extensions are compared against None only,
    # because 0 is a legitimate (and recommended) extension length.
    if not min_ss_len:
        min_ss_len = default_ss_len
    if left_ext is None:
        left_ext = default_ext
    if right_ext is None:
        right_ext = default_ext

    pos = 0
    match_len = 0

    while pos + min_ss_len <= src_len:
        hit_len = 0
        ix = 1

        substring = src[pos : pos + min_ss_len]

        # Clamp the left edge of the search window to the start of tar.
        search_begin = pos - left_ext
        if search_begin < 0:
            search_begin = 0
            left_ext_len = pos
        else:
            left_ext_len = left_ext

        # Clamp the right extension to the end of tar (this can go negative
        # when pos is already beyond the end of tar).
        if pos + min_ss_len + right_ext >= tar_len:
            right_ext_len = tar_len - pos - min_ss_len
        else:
            right_ext_len = right_ext

        window_len = left_ext_len + min_ss_len + right_ext_len
        if window_len > 0:
            search_val = tar[search_begin : search_begin + window_len]
        else:
            search_val = ''

        flagged_tar = ''
        # While the current substring is found in the window, record the
        # hit, then try a substring one character longer against a window
        # widened by one character on the right (where tar allows it).
        while substring in search_val and pos + ix <= src_len:
            hit_len = len(substring)
            # Masks every occurrence of the substring in tar, not only the
            # one inside the search window.
            flagged_tar = tar.replace(substring, '#' * hit_len)

            if pos + min_ss_len + ix <= src_len:
                substring = src[pos : pos + min_ss_len + ix]

            if pos + min_ss_len + right_ext_len + 1 <= tar_len:
                right_ext_len += 1

            window_end = (
                search_begin + left_ext_len + min_ss_len + right_ext_len
            )
            search_val = tar[search_begin:window_end]

            ix += 1

        if hit_len > 0:
            # Commit the masking so matched characters cannot match again.
            tar = flagged_tar

        match_len += hit_len
        # ix is one more than the number of rounds the inner loop ran.
        pos += ix

    return match_len / max_len


def dist_baystat(src, tar, min_ss_len=None, left_ext=None, right_ext=None):
    """Return the Baystat distance.

    The Baystat distance is obtained by subtracting the Baystat similarity
    from 1: :math:`dist_{Baystat} = 1 - sim_{Baystat}`. All arguments are
    forwarded unchanged to :func:`sim_baystat`.

    :param str src: source string for comparison
    :param str tar: target string for comparison
    :param int min_ss_len: minimum substring length to be considered
    :param int left_ext: left-side extension length
    :param int right_ext: right-side extension length
    :returns: the Baystat distance
    :rtype: float

    >>> round(dist_baystat('cat', 'hat'), 12)
    0.333333333333
    >>> dist_baystat('Niall', 'Neil')
    0.6
    >>> round(dist_baystat('Colin', 'Cuilen'), 12)
    0.833333333333
    >>> dist_baystat('ATCG', 'TAGC')
    1.0
    """
    similarity = sim_baystat(
        src,
        tar,
        min_ss_len=min_ss_len,
        left_ext=left_ext,
        right_ext=right_ext,
    )
    return 1 - similarity


if __name__ == '__main__':
    # When executed as a script, run the doctests embedded in this module's
    # docstrings.
    from doctest import testmod

    testmod()


1		# -*- coding: utf-8 -*-
2
3		# Copyright 2018 by Christopher C. Little.
4		# This file is part of Abydos.
5		#
6		# Abydos is free software: you can redistribute it and/or modify
7		# it under the terms of the GNU General Public License as published by
8		# the Free Software Foundation, either version 3 of the License, or
9		# (at your option) any later version.
10		#
11		# Abydos is distributed in the hope that it will be useful,
12		# but WITHOUT ANY WARRANTY; without even the implied warranty of
13		# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14		# GNU General Public License for more details.
15		#
16		# You should have received a copy of the GNU General Public License
17		# along with Abydos. If not, see <http://www.gnu.org/licenses/>.
18
19	1	"""abydos.distance.baystat.
20
21		The distance.baystat module implements BayStat similarity.
22		"""
23
24	1	from __future__ import division, unicode_literals
25
26	1	__all__ = ['dist_baystat', 'sim_baystat']
27
28
29	1	def sim_baystat(src, tar, min_ss_len=None, left_ext=None, right_ext=None):
		0 ignored issues – show Comprehensibility introduced 2018-08-27 05:44 UTC by Report Bug Copy Issue Report This function exceeds the maximum number of variables (16/15). Loading history...
30		"""Return the Baystat similarity.
31
32		Good results for shorter words are reported when setting min_ss_len to 1
33		and either left_ext OR right_ext to 1.
34
35		The Baystat similarity is defined in :cite:`Furnohr:2002`.
36
37		This is ostensibly a port of the R module PPRL's implementation:
38		https://github.com/cran/PPRL/blob/master/src/MTB_Baystat.cpp
39		:cite:`Rukasz:2018`. As such, this could be made more pythonic.
40
41		:param str src: source string for comparison
42		:param str tar: target string for comparison
43		:param int min_ss_len: minimum substring length to be considered
44		:param int left_ext: left-side extension length
45		:param int right_ext: right-side extension length
46		:returns: the Baystat similarity
47		:rtype: float
48
49		>>> round(sim_baystat('cat', 'hat'), 12)
50		0.666666666667
51		>>> sim_baystat('Niall', 'Neil')
52		0.4
53		>>> round(sim_baystat('Colin', 'Cuilen'), 12)
54		0.166666666667
55		>>> sim_baystat('ATCG', 'TAGC')
56		0.0
57		"""
58	1	if src == tar:
59	1	return 1
60	1	if not src or not tar:
61	1	return 0
62
63	1	max_len = max(len(src), len(tar))
64
65	1	if not (min_ss_len and left_ext and right_ext):
66		# These can be set via arguments to the function. Otherwise they are
67		# set automatically based on values from the article.
68	1	if max_len >= 7:
69	1	min_ss_len = 2
70	1	left_ext = 2
71	1	right_ext = 2
72		else:
73		# The paper suggests that for short names, (exclusively) one or the
74		# other of left_ext and right_ext can be 1, with good results.
75		# I use 0 & 0 as the default in this case.
76	1	min_ss_len = 1
77	1	left_ext = 0
78	1	right_ext = 0
79
80	1	pos = 0
81	1	match_len = 0
82
83	1	while True:
84	1	if pos + min_ss_len > len(src):
85	1	return match_len / max_len
86
87	1	hit_len = 0
88	1	ix = 1
		0 ignored issues – show Coding Style Naming introduced 2018-10-23 05:52 UTC by Report Bug Copy Issue Report The name `ix` does not conform to the variable naming conventions (`(([a-z][a-z0-9_]{2,30})\|(_[a-z0-9_]*))$`). This check looks for invalid names for a range of different identifiers. You can set regular expressions to which the identifiers must conform if the defaults do not match your requirements. If your project includes a Pylint configuration file, the settings contained in that file take precedence. To find out more about Pylint, please refer to their site. Loading history...
89
90	1	substring = src[pos : pos + min_ss_len]
91	1	search_begin = pos - left_ext
92
93	1	if search_begin < 0:
94	1	search_begin = 0
95	1	left_ext_len = pos
96		else:
97	1	left_ext_len = left_ext
98
99	1	if pos + min_ss_len + right_ext >= len(tar):
100	1	right_ext_len = len(tar) - pos - min_ss_len
101		else:
102	1	right_ext_len = right_ext
103
104	1	if (
105		search_begin + left_ext_len + min_ss_len + right_ext_len
		0 ignored issues – show Coding Style introduced 2018-10-24 06:00 UTC by Report Bug Copy Issue Report Wrong hanging indentation before block (add 4 spaces). Loading history...
106		> search_begin
		0 ignored issues – show Coding Style introduced 2018-10-24 06:00 UTC by Report Bug Copy Issue Report Wrong hanging indentation before block (add 4 spaces). Loading history...
107		):
108	1	search_val = tar[
109		search_begin : (
110		search_begin + left_ext_len + min_ss_len + right_ext_len
111		)
112		]
113		else:
114	1	search_val = ''
115
116	1	flagged_tar = ''
117	1	while substring in search_val and pos + ix <= len(src):
118	1	hit_len = len(substring)
119	1	flagged_tar = tar.replace(substring, '#' * hit_len)
120
121	1	if pos + min_ss_len + ix <= len(src):
122	1	substring = src[pos : pos + min_ss_len + ix]
123
124	1	if pos + min_ss_len + right_ext_len + 1 <= len(tar):
125	1	right_ext_len += 1
126
127		# The following is unnecessary, I think
128		# if (search_begin + left_ext_len + min_ss_len + right_ext_len <=
129		# len(tar)):
130	1	search_val = tar[
131		search_begin : (
132		search_begin + left_ext_len + min_ss_len + right_ext_len
133		)
134		]
135
136	1	ix += 1
		0 ignored issues – show Coding Style Naming introduced 2018-10-24 06:00 UTC by Report Bug Copy Issue Report The name `ix` does not conform to the variable naming conventions (`(([a-z][a-z0-9_]{2,30})\|(_[a-z0-9_]*))$`). This check looks for invalid names for a range of different identifiers. You can set regular expressions to which the identifiers must conform if the defaults do not match your requirements. If your project includes a Pylint configuration file, the settings contained in that file take precedence. To find out more about Pylint, please refer to their site. Loading history...
137
138	1	if hit_len > 0:
139	1	tar = flagged_tar
140
141	1	match_len += hit_len
142	1	pos += ix
143
144
145	1	def dist_baystat(src, tar, min_ss_len=None, left_ext=None, right_ext=None):
146		"""Return the Baystat distance.
147
148		Normalized Baystat similarity is the complement of normalized Baystat
149		distance: :math:`sim_{Baystat} = 1 - dist_{Baystat}`.
150
151		:param str src: source string for comparison
152		:param str tar: target string for comparison
153		:param int min_ss_len: minimum substring length to be considered
154		:param int left_ext: left-side extension length
155		:param int right_ext: right-side extension length
156		:returns: the Baystat distance
157		:rtype: float
158
159		>>> round(dist_baystat('cat', 'hat'), 12)
160		0.333333333333
161		>>> dist_baystat('Niall', 'Neil')
162		0.6
163		>>> round(dist_baystat('Colin', 'Cuilen'), 12)
164		0.833333333333
165		>>> dist_baystat('ATCG', 'TAGC')
166		1.0
167		"""
168	1	return 1 - sim_baystat(src, tar, min_ss_len, left_ext, right_ext)
169
170
171		if __name__ == '__main__':
172		import doctest
173
174		doctest.testmod()
175

chrislit / abydos

Branch — master (78a222)

abydos.distance._baystat.sim_baystat() F

Complexity

Size

Duplication

Code Coverage

Importance

How to fix Long Method Complexity

Long Method

Complexity

Duplication Side-by-Side

Filter issues like