abydos.distance._isg.ISG.sim()   A
last analyzed

Complexity

Conditions 5

Size

Total Lines 38
Code Lines 8

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 6
CRAP Score 5

Importance

Changes 0
Metric Value
eloc 8
dl 0
loc 38
ccs 6
cts 6
cp 1
rs 9.3333
c 0
b 0
f 0
cc 5
nop 3
crap 5
1
# Copyright 2019-2020 by Christopher C. Little.
2
# This file is part of Abydos.
3
#
4
# Abydos is free software: you can redistribute it and/or modify
5
# it under the terms of the GNU General Public License as published by
6
# the Free Software Foundation, either version 3 of the License, or
7
# (at your option) any later version.
8
#
9
# Abydos is distributed in the hope that it will be useful,
10
# but WITHOUT ANY WARRANTY; without even the implied warranty of
11
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12
# GNU General Public License for more details.
13
#
14
# You should have received a copy of the GNU General Public License
15
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.
16
17
"""abydos.distance._isg.
18
19 1
Bouchard & Pouyez's Indice de Similitude-Guth (ISG)
20
"""
21
22
from typing import Any
23
24 1
from ._distance import _Distance
25
26
# Names exported by ``from <this module> import *``.
__all__ = ['ISG']
27
28
29
class ISG(_Distance):
    """Indice de Similitude-Guth (ISG) similarity.

    This is an implementation of Bouchard & Pouyez's Indice de Similitude-Guth
    (ISG) :cite:`Bouchard:1980`. At its heart, ISG is Jaccard similarity, but
    limits on token matching are added according to part of Guth's matching
    criteria :cite:`Guth:1976`.

    :cite:`Bouchard:1980` is limited in its implementation details. Based on
    the examples given in the paper, it appears that only the first 4 of Guth's
    rules are considered (a letter in the first string must match a letter in
    the second string appearing in the same position, an adjacent position, or
    two positions ahead). It also appears that the distance in the paper is
    the greater of the distance from string 1 to string 2 and the distance
    from string 2 to string 1.

    These qualities can be specified as parameters. At initialization, specify
    ``full_guth=True`` to apply all of Guth's rules and ``symmetric=False`` to
    calculate only the distance from string 1 to string 2.

    .. versionadded:: 0.4.1
    """

    def __init__(
        self, full_guth: bool = False, symmetric: bool = True, **kwargs: Any
    ) -> None:
        """Initialize ISG instance.

        Parameters
        ----------
        full_guth : bool
            Whether to apply all of Guth's matching rules
        symmetric : bool
            Whether to calculate the symmetric distance
        **kwargs
            Arbitrary keyword arguments, forwarded to the parent initializer


        .. versionadded:: 0.4.1

        """
        super().__init__(**kwargs)
        self._full_guth = full_guth
        self._symmetric = symmetric

    def _isg_i(self, src: str, tar: str) -> float:
        """Return an individual ISG similarity (not symmetric) for src to tar.

        Each position of ``src`` contributes at most one match. The result is
        ``matches / (len(src) + len(tar) - matches)``.

        Parameters
        ----------
        src : str
            Source string for comparison
        tar : str
            Target string for comparison

        Returns
        -------
        float
            The ISG similarity; 1.0 when both strings are empty


        .. versionadded:: 0.4.1

        """
        # Positions past this index do not exist in both strings.
        shorter = min(len(src), len(tar))

        matches = 0
        for pos, char in enumerate(src):
            # Window covering the previous, same, and next two positions.
            lo, hi = max(0, pos - 1), pos + 3

            # Basic rule: the source character occurs in the target window.
            if char in tar[lo:hi]:
                matches += 1
                continue

            if not self._full_guth:
                continue

            # Mirror rule: the target character at this position occurs in
            # the source window. Slicing yields '' past the end of tar, and
            # '' is "in" every string, hence the explicit truthiness check.
            tar_char = tar[pos : pos + 1]
            if tar_char and tar_char in src[lo:hi]:
                matches += 1
                continue

            # Look-ahead rule: both strings agree one or two positions ahead.
            if any(
                ahead < shorter and src[ahead] == tar[ahead]
                for ahead in (pos + 1, pos + 2)
            ):
                matches += 1

        denominator = len(src) + len(tar) - matches
        if not denominator:
            # Only possible when both strings are empty; treat them as
            # identical rather than dividing by zero.
            return 1.0
        return matches / denominator

    def sim(self, src: str, tar: str) -> float:
        """Return the Indice de Similitude-Guth (ISG) similarity of two words.

        Identical strings score 1.0. When the strings differ in length, the
        shorter one is always used as the source, regardless of argument
        order. When they have equal length and the instance is symmetric,
        the greater of the two directional similarities is returned.

        Parameters
        ----------
        src : str
            Source string for comparison
        tar : str
            Target string for comparison

        Returns
        -------
        float
            The ISG similarity

        Examples
        --------
        >>> cmp = ISG()
        >>> cmp.sim('cat', 'hat')
        0.5
        >>> cmp.sim('Niall', 'Neil')
        0.5
        >>> cmp.sim('aluminum', 'Catalan')
        0.15384615384615385
        >>> cmp.sim('ATCG', 'TAGC')
        1.0


        .. versionadded:: 0.4.1

        """
        if src == tar:
            return 1.0

        if len(src) > len(tar):
            # Always compare from the shorter string to the longer one.
            src, tar = tar, src
        elif self._symmetric and len(src) == len(tar):
            return max(self._isg_i(src, tar), self._isg_i(tar, src))

        return self._isg_i(src, tar)
164 1
165 1
166 1
# When executed as a script, run the doctests embedded in this module.
if __name__ == '__main__':
    import doctest

    doctest.testmod()
170