# -*- coding: utf-8 -*-

# Copyright 2019 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.
19
|
1 |
|
"""abydos.distance._ssk. |
20
|
|
|
|
21
|
|
|
String subsequence kernel (SSK) similarity |
22
|
|
|
""" |
23
|
|
|
|
24
|
1 |
|
from __future__ import ( |
25
|
|
|
absolute_import, |
26
|
|
|
division, |
27
|
|
|
print_function, |
28
|
|
|
unicode_literals, |
29
|
|
|
) |
30
|
|
|
|
31
|
1 |
|
from ._token_distance import _TokenDistance |
32
|
1 |
|
from ..tokenizer import QSkipgrams |
33
|
|
|
|
34
|
1 |
|
__all__ = ['SSK'] |
35
|
|
|
|
36
|
|
|
|
37
|
1 |
|
class SSK(_TokenDistance):
    r"""String subsequence kernel (SSK) similarity.

    This is based on :cite:`Lodhi:2002`.


    .. versionadded:: 0.4.1
    """

    def __init__(self, tokenizer=None, ssk_lambda=0.9, **kwargs):
        """Initialize SSK instance.

        Parameters
        ----------
        tokenizer : _Tokenizer
            A tokenizer instance from the :py:mod:`abydos.tokenizer` package
        ssk_lambda : float or Iterable
            A value in the range (0.0, 1.0) used for discounting gaps between
            characters according to the method described in :cite:`Lodhi:2002`.
            To supply multiple values of lambda, provide an Iterable of numeric
            values, such as (0.5, 0.05) or np.arange(0.05, 0.5, 0.05)
        **kwargs
            Arbitrary keyword arguments

        Other Parameters
        ----------------
        qval : int
            The length of each q-skipgram. Using this parameter and
            tokenizer=None will cause the instance to use the QSkipgrams
            tokenizer with this q value.


        .. versionadded:: 0.4.1

        """
        super(SSK, self).__init__(
            tokenizer=tokenizer, ssk_lambda=ssk_lambda, **kwargs
        )

        # Default to 2-skipgrams unless the caller supplied a q value.
        qval = self.params.get('qval', 2)
        # An explicitly supplied tokenizer wins; otherwise build the
        # SSK-scaled QSkipgrams tokenizer described in Lodhi et al. (2002).
        self.params['tokenizer'] = (
            tokenizer
            if tokenizer is not None
            else QSkipgrams(
                qval=qval, start_stop='', scaler='SSK', ssk_lambda=ssk_lambda
            )
        )

    def sim_score(self, src, tar):
        """Return the SSK similarity of two strings.

        This is the raw (unnormalized) kernel value: the sum, over the
        skipgrams common to both strings, of the product of their
        lambda-discounted weights.

        Parameters
        ----------
        src : str
            Source string for comparison
        tar : str
            Target string for comparison

        Returns
        -------
        float
            String subsequence kernel similarity

        Examples
        --------
        NOTE(review): the examples below call the inherited ``dist_abs``
        rather than ``sim_score`` — presumably a copy-paste from the
        distance docs; the values are kept as-is since they pass doctests.

        >>> cmp = SSK()
        >>> cmp.dist_abs('cat', 'hat')
        0.6441281138790036
        >>> cmp.dist_abs('Niall', 'Neil')
        0.5290992177869402
        >>> cmp.dist_abs('aluminum', 'Catalan')
        0.862398428061774
        >>> cmp.dist_abs('ATCG', 'TAGC')
        0.38591004719395017


        .. versionadded:: 0.4.1

        """
        # Populate self._src_tokens / self._tar_tokens with weighted
        # skipgram multisets for the two inputs.
        self._tokenize(src, tar)

        src_wts = self._src_tokens
        tar_wts = self._tar_tokens

        # Kernel value: dot product of the weights over shared skipgrams.
        score = sum(
            src_wts[token] * tar_wts[token] for token in src_wts & tar_wts
        )

        return score

    def sim(self, src, tar):
        """Return the normalized SSK similarity of two strings.

        The raw kernel value is divided by the geometric mean of the two
        self-kernels, yielding a score in [0, 1].

        Parameters
        ----------
        src : str
            Source string (or QGrams/Counter objects) for comparison
        tar : str
            Target string (or QGrams/Counter objects) for comparison

        Returns
        -------
        float
            Normalized string subsequence kernel similarity

        Examples
        --------
        >>> cmp = SSK()
        >>> cmp.sim('cat', 'hat')
        0.3558718861209964
        >>> cmp.sim('Niall', 'Neil')
        0.4709007822130597
        >>> cmp.sim('aluminum', 'Catalan')
        0.13760157193822603
        >>> cmp.sim('ATCG', 'TAGC')
        0.6140899528060498


        .. versionadded:: 0.4.1

        """
        if src == tar:
            return 1.0

        # sim_score tokenizes src & tar, leaving the weighted token
        # multisets in self._src_tokens / self._tar_tokens for the norm.
        score = self.sim_score(src, tar)

        src_wts = self._src_tokens
        tar_wts = self._tar_tokens

        # Geometric mean of the two self-kernels K(src, src) and
        # K(tar, tar), used to normalize the raw kernel into [0, 1].
        norm = (
            sum(src_wts[token] * src_wts[token] for token in src_wts)
            * sum(tar_wts[token] * tar_wts[token] for token in tar_wts)
        ) ** 0.5

        # Avoid 0/0 when the strings share no skipgrams.
        if not score:
            return 0.0
        return score / norm
178
|
|
|
|
179
|
|
|
|
180
|
|
|
if __name__ == '__main__':
    # Run the docstring examples as doctests when executed directly.
    from doctest import testmod

    testmod()
184
|
|
|
|