Completed
Push — master ( 643512...2b6b3e )
by Chris
20:40 queued 10:36
created

abydos.distance._ssk   A

Complexity

Total Complexity 7

Size/Duplication

Total Lines 184
Duplicated Lines 0 %

Test Coverage

Coverage 100%

Importance

Changes 0
Metric Value
wmc 7
eloc 44
dl 0
loc 184
ccs 27
cts 27
cp 1
rs 10
c 0
b 0
f 0

3 Methods

Rating   Name   Duplication   Size   Complexity  
A SSK.sim_score() 0 41 1
A SSK.__init__() 0 36 3
A SSK.sim() 0 51 3
1
# -*- coding: utf-8 -*-
2
3
# Copyright 2019 by Christopher C. Little.
4
# This file is part of Abydos.
5
#
6
# Abydos is free software: you can redistribute it and/or modify
7
# it under the terms of the GNU General Public License as published by
8
# the Free Software Foundation, either version 3 of the License, or
9
# (at your option) any later version.
10
#
11
# Abydos is distributed in the hope that it will be useful,
12
# but WITHOUT ANY WARRANTY; without even the implied warranty of
13
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14
# GNU General Public License for more details.
15
#
16
# You should have received a copy of the GNU General Public License
17
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.
18
19 1
"""abydos.distance._ssk.
20
21
String subsequence kernel (SSK) similarity
22
"""
23
24 1
from __future__ import (
25
    absolute_import,
26
    division,
27
    print_function,
28
    unicode_literals,
29
)
30
31 1
from ._token_distance import _TokenDistance
32 1
from ..tokenizer import QSkipgrams
33
34 1
__all__ = ['SSK']
35
36
37 1
class SSK(_TokenDistance):
    r"""String subsequence kernel (SSK) similarity.

    This is based on :cite:`Lodhi:2002`.

    .. versionadded:: 0.4.1
    """

    def __init__(self, tokenizer=None, ssk_lambda=0.9, **kwargs):
        """Initialize SSK instance.

        Parameters
        ----------
        tokenizer : _Tokenizer
            A tokenizer instance from the :py:mod:`abydos.tokenizer` package
        ssk_lambda : float or Iterable
            A value in the range (0.0, 1.0) used for discounting gaps between
            characters according to the method described in :cite:`Lodhi:2002`.
            To supply multiple values of lambda, provide an Iterable of numeric
            values, such as (0.5, 0.05) or np.arange(0.05, 0.5, 0.05)
        **kwargs
            Arbitrary keyword arguments

        Other Parameters
        ----------------
        qval : int
            The length of each q-skipgram. Using this parameter and
            tokenizer=None will cause the instance to use the QSkipgrams
            tokenizer with this q value.


        .. versionadded:: 0.4.1

        """
        super(SSK, self).__init__(
            tokenizer=tokenizer, ssk_lambda=ssk_lambda, **kwargs
        )

        # Default to 2-skipgrams unless a qval was passed through kwargs.
        qval = 2 if 'qval' not in self.params else self.params['qval']
        # When no tokenizer is supplied, build a QSkipgrams tokenizer with
        # the SSK scaler so token weights carry the lambda gap discounts.
        self.params['tokenizer'] = (
            tokenizer
            if tokenizer is not None
            else QSkipgrams(
                qval=qval, start_stop='', scaler='SSK', ssk_lambda=ssk_lambda
            )
        )

    def sim_score(self, src, tar):
        """Return the SSK similarity of two strings.

        Parameters
        ----------
        src : str
            Source string for comparison
        tar : str
            Target string for comparison

        Returns
        -------
        float
            String subsequence kernel similarity

        Examples
        --------
        NOTE(review): the examples below exercise ``dist_abs`` rather than
        ``sim_score``; their values are the complements of the ``sim``
        examples. Confirm whether ``sim_score`` examples were intended.

        >>> cmp = SSK()
        >>> cmp.dist_abs('cat', 'hat')
        0.6441281138790036
        >>> cmp.dist_abs('Niall', 'Neil')
        0.5290992177869402
        >>> cmp.dist_abs('aluminum', 'Catalan')
        0.862398428061774
        >>> cmp.dist_abs('ATCG', 'TAGC')
        0.38591004719395017


        .. versionadded:: 0.4.1

        """
        self._tokenize(src, tar)

        src_wts = self._src_tokens
        tar_wts = self._tar_tokens

        # Kernel value: inner product of the two weight vectors over the
        # tokens the strings share.
        return sum(
            src_wts[token] * tar_wts[token] for token in src_wts & tar_wts
        )

    def sim(self, src, tar):
        """Return the normalized SSK similarity of two strings.

        Parameters
        ----------
        src : str
            Source string (or QGrams/Counter objects) for comparison
        tar : str
            Target string (or QGrams/Counter objects) for comparison

        Returns
        -------
        float
            Normalized string subsequence kernel similarity

        Examples
        --------
        >>> cmp = SSK()
        >>> cmp.sim('cat', 'hat')
        0.3558718861209964
        >>> cmp.sim('Niall', 'Neil')
        0.4709007822130597
        >>> cmp.sim('aluminum', 'Catalan')
        0.13760157193822603
        >>> cmp.sim('ATCG', 'TAGC')
        0.6140899528060498


        .. versionadded:: 0.4.1

        """
        if src == tar:
            return 1.0

        # sim_score tokenizes src & tar and leaves the token weights on
        # self._src_tokens / self._tar_tokens for the normalization below.
        score = self.sim_score(src, tar)
        if not score:
            return 0.0

        src_wts = self._src_tokens
        tar_wts = self._tar_tokens

        # Normalize by the geometric mean of the two self-similarities
        # (the kernel of each string with itself).
        norm = (
            sum(wt * wt for wt in src_wts.values())
            * sum(wt * wt for wt in tar_wts.values())
        ) ** 0.5

        return score / norm
178
179
180
if __name__ == '__main__':
    # When run as a script, execute the docstring examples in this module.
    from doctest import testmod

    testmod()
184