Completed
Branch master (87ccc1)
by Chris
10:18
created

tests.distance.test_distance_sift4   A

Complexity

Total Complexity 4

Size/Duplication

Total Lines 243
Duplicated Lines 0 %

Importance

Changes 0
Metric Value
eloc 156
dl 0
loc 243
rs 10
c 0
b 0
f 0
wmc 4
1
# -*- coding: utf-8 -*-
2
3
# Copyright 2018 by Christopher C. Little.
4
# This file is part of Abydos.
5
#
6
# Abydos is free software: you can redistribute it and/or modify
7
# it under the terms of the GNU General Public License as published by
8
# the Free Software Foundation, either version 3 of the License, or
9
# (at your option) any later version.
10
#
11
# Abydos is distributed in the hope that it will be useful,
12
# but WITHOUT ANY WARRANTY; without even the implied warranty of
13
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14
# GNU General Public License for more details.
15
#
16
# You should have received a copy of the GNU General Public License
17
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.
18
19
"""abydos.tests.distance.test_distance_sift4.
20
21
This module contains unit tests for abydos.distance.sift4
22
"""
23
24
from __future__ import division, unicode_literals
25
26
import unittest
27
28
from abydos.distance.sift4 import dist_sift4, sift4_common, sift4_simplest, \
29
    sim_sift4
30
31
32
class Sift4TestCases(unittest.TestCase):
    """Test Sift4 functions.

    abydos.distance.sift4_simplest, sift4_common, dist_sift4, & sim_sift4
    """

    def test_sift4_simplest(self):
        """Test abydos.distance.sift4_simplest.

        Checks empty inputs, identical inputs, single insertions, deletions
        and substitutions, a few longer word pairs, and two cases that pass
        an explicit third positional argument (presumably the maximum
        offset -- confirm against abydos.distance.sift4).
        """
        # tests copied from Lukas Benedix's post at
        # https://siderite.blogspot.com/2014/11/super-fast-and-accurate-string-distance.html
        self.assertEqual(sift4_simplest('', ''), 0)
        self.assertEqual(sift4_simplest('a', ''), 1)
        self.assertEqual(sift4_simplest('', 'a'), 1)
        self.assertEqual(sift4_simplest('abc', ''), 3)
        self.assertEqual(sift4_simplest('', 'abc'), 3)

        self.assertEqual(sift4_simplest('a', 'a'), 0)
        self.assertEqual(sift4_simplest('abc', 'abc'), 0)

        self.assertEqual(sift4_simplest('a', 'ab'), 1)
        self.assertEqual(sift4_simplest('ac', 'abc'), 1)
        self.assertEqual(sift4_simplest('abcdefg', 'xabxcdxxefxgx'), 10)

        self.assertEqual(sift4_simplest('ab', 'b'), 1)
        self.assertEqual(sift4_simplest('ab', 'a'), 1)
        self.assertEqual(sift4_simplest('abc', 'ac'), 1)
        self.assertEqual(sift4_simplest('xabxcdxxefxgx', 'abcdefg'), 10)

        self.assertEqual(sift4_simplest('a', 'b'), 1)
        self.assertEqual(sift4_simplest('ab', 'ac'), 1)
        self.assertEqual(sift4_simplest('ac', 'bc'), 1)
        self.assertEqual(sift4_simplest('abc', 'axc'), 1)
        self.assertEqual(sift4_simplest('xabxcdxxefxgx', '1ab2cd34ef5g6'), 6)

        self.assertEqual(sift4_simplest('example', 'samples'), 2)
        self.assertEqual(sift4_simplest('sturgeon', 'urgently'), 4)
        self.assertEqual(sift4_simplest('levenshtein', 'frankenstein'), 10)
        self.assertEqual(sift4_simplest('distance', 'difference'), 7)

        # Tests copied from
        # https://github.com/tdebatty/java-string-similarity/blob/master/src/test/java/info/debatty/java/stringsimilarity/experimental/Sift4Test.java
        self.assertEqual(sift4_simplest('This is the first string',
                                        'And this is another string', 5), 13)
        self.assertEqual(sift4_simplest('Lorem ipsum dolor sit amet, ' +
                                        'consectetur adipiscing elit.',
                                        'Amet Lorm ispum dolor sit amet, ' +
                                        'consetetur adixxxpiscing elit.',
                                        10), 20)

    def test_sift4_common(self):
        """Test abydos.distance.sift4_common.

        Uses the same input pairs as the sift4_simplest test (with the
        expected values for this variant), followed by cases that pass a
        fourth positional argument, labelled max_distance below.
        """
        # tests copied from Lukas Benedix's post at
        # https://siderite.blogspot.com/2014/11/super-fast-and-accurate-string-distance.html
        self.assertEqual(sift4_common('', ''), 0)
        self.assertEqual(sift4_common('a', ''), 1)
        self.assertEqual(sift4_common('', 'a'), 1)
        self.assertEqual(sift4_common('abc', ''), 3)
        self.assertEqual(sift4_common('', 'abc'), 3)

        self.assertEqual(sift4_common('a', 'a'), 0)
        self.assertEqual(sift4_common('abc', 'abc'), 0)

        self.assertEqual(sift4_common('a', 'ab'), 1)
        self.assertEqual(sift4_common('ac', 'abc'), 1)
        self.assertEqual(sift4_common('abcdefg', 'xabxcdxxefxgx'), 7)

        self.assertEqual(sift4_common('ab', 'b'), 1)
        self.assertEqual(sift4_common('ab', 'a'), 1)
        self.assertEqual(sift4_common('abc', 'ac'), 1)
        self.assertEqual(sift4_common('xabxcdxxefxgx', 'abcdefg'), 7)

        self.assertEqual(sift4_common('a', 'b'), 1)
        self.assertEqual(sift4_common('ab', 'ac'), 1)
        self.assertEqual(sift4_common('ac', 'bc'), 1)
        self.assertEqual(sift4_common('abc', 'axc'), 1)
        self.assertEqual(sift4_common('xabxcdxxefxgx', '1ab2cd34ef5g6'), 6)

        self.assertEqual(sift4_common('example', 'samples'), 2)
        self.assertEqual(sift4_common('sturgeon', 'urgently'), 3)
        self.assertEqual(sift4_common('levenshtein', 'frankenstein'), 6)
        self.assertEqual(sift4_common('distance', 'difference'), 5)

        # Tests copied from
        # https://github.com/tdebatty/java-string-similarity/blob/master/src/test/java/info/debatty/java/stringsimilarity/experimental/Sift4Test.java
        self.assertEqual(sift4_common('This is the first string',
                                      'And this is another string', 5), 11)
        self.assertEqual(sift4_common('Lorem ipsum dolor sit amet, ' +
                                      'consectetur adipiscing elit.',
                                      'Amet Lorm ispum dolor sit amet, ' +
                                      'consetetur adixxxpiscing elit.',
                                      10), 12)

        # cases with max_distance
        self.assertEqual(sift4_common('example', 'samples', 5, 5), 5)
        self.assertEqual(sift4_common('sturgeon', 'urgently', 5, 5), 5)
        self.assertEqual(sift4_common('levenshtein', 'frankenstein', 5, 5), 5)
        self.assertEqual(sift4_common('distance', 'difference', 5, 5), 5)

    def test_dist_sift4(self):
        """Test abydos.distance.dist_sift4.

        Expected values that are not exactly representable as binary
        floats (e.g. 1/3) are compared with assertAlmostEqual, matching
        how the sim_sift4 test compares 2/3.
        """
        # tests copied from Lukas Benedix's post at
        # https://siderite.blogspot.com/2014/11/super-fast-and-accurate-string-distance.html
        self.assertEqual(dist_sift4('', ''), 0)
        self.assertEqual(dist_sift4('a', ''), 1)
        self.assertEqual(dist_sift4('', 'a'), 1)
        self.assertEqual(dist_sift4('abc', ''), 1)
        self.assertEqual(dist_sift4('', 'abc'), 1)

        self.assertEqual(dist_sift4('a', 'a'), 0)
        self.assertEqual(dist_sift4('abc', 'abc'), 0)

        self.assertEqual(dist_sift4('a', 'ab'), 0.5)
        self.assertAlmostEqual(dist_sift4('ac', 'abc'), 1/3)
        self.assertAlmostEqual(dist_sift4('abcdefg', 'xabxcdxxefxgx'),
                               0.538461538)

        self.assertEqual(dist_sift4('ab', 'b'), 0.5)
        self.assertEqual(dist_sift4('ab', 'a'), 0.5)
        self.assertAlmostEqual(dist_sift4('abc', 'ac'), 1/3)
        self.assertAlmostEqual(dist_sift4('xabxcdxxefxgx', 'abcdefg'),
                               0.538461538)

        self.assertEqual(dist_sift4('a', 'b'), 1)
        self.assertEqual(dist_sift4('ab', 'ac'), 0.5)
        self.assertEqual(dist_sift4('ac', 'bc'), 0.5)
        self.assertAlmostEqual(dist_sift4('abc', 'axc'), 1/3)
        self.assertAlmostEqual(dist_sift4('xabxcdxxefxgx', '1ab2cd34ef5g6'),
                               0.461538461)

        self.assertAlmostEqual(dist_sift4('example', 'samples'), 0.285714285)
        self.assertAlmostEqual(dist_sift4('sturgeon', 'urgently'), 0.375)
        self.assertAlmostEqual(dist_sift4('levenshtein', 'frankenstein'), 0.5)
        self.assertAlmostEqual(dist_sift4('distance', 'difference'), 0.5)

        # Tests copied from
        # https://github.com/tdebatty/java-string-similarity/blob/master/src/test/java/info/debatty/java/stringsimilarity/experimental/Sift4Test.java
        self.assertAlmostEqual(dist_sift4('This is the first string',
                                          'And this is another string',
                                          5), 0.423076923)
        self.assertAlmostEqual(dist_sift4('Lorem ipsum dolor sit amet, ' +
                                          'consectetur adipiscing elit.',
                                          'Amet Lorm ispum dolor sit amet, ' +
                                          'consetetur adixxxpiscing elit.',
                                          10), 0.193548387)

        # cases with max_distance
        self.assertAlmostEqual(dist_sift4('example', 'samples', 5, 5),
                               0.714285714)
        self.assertAlmostEqual(dist_sift4('sturgeon', 'urgently', 5, 5), 0.625)
        self.assertAlmostEqual(dist_sift4('levenshtein', 'frankenstein', 5, 5),
                               0.416666666)
        self.assertAlmostEqual(dist_sift4('distance', 'difference', 5, 5), 0.5)

    def test_sim_sift4(self):
        """Test abydos.distance.sim_sift4.

        The expected values here are the complements (1 - x) of the
        values asserted for the same inputs in the dist_sift4 test.
        """
        # tests copied from Lukas Benedix's post at
        # https://siderite.blogspot.com/2014/11/super-fast-and-accurate-string-distance.html
        self.assertEqual(sim_sift4('', ''), 1)
        self.assertEqual(sim_sift4('a', ''), 0)
        self.assertEqual(sim_sift4('', 'a'), 0)
        self.assertEqual(sim_sift4('abc', ''), 0)
        self.assertEqual(sim_sift4('', 'abc'), 0)

        self.assertEqual(sim_sift4('a', 'a'), 1)
        self.assertEqual(sim_sift4('abc', 'abc'), 1)

        self.assertEqual(sim_sift4('a', 'ab'), 0.5)
        self.assertAlmostEqual(sim_sift4('ac', 'abc'), 2/3)
        self.assertAlmostEqual(sim_sift4('abcdefg', 'xabxcdxxefxgx'),
                               0.461538461)

        self.assertEqual(sim_sift4('ab', 'b'), 0.5)
        self.assertEqual(sim_sift4('ab', 'a'), 0.5)
        self.assertAlmostEqual(sim_sift4('abc', 'ac'), 2/3)
        self.assertAlmostEqual(sim_sift4('xabxcdxxefxgx', 'abcdefg'),
                               0.461538461)

        self.assertEqual(sim_sift4('a', 'b'), 0)
        self.assertEqual(sim_sift4('ab', 'ac'), 0.5)
        self.assertEqual(sim_sift4('ac', 'bc'), 0.5)
        self.assertAlmostEqual(sim_sift4('abc', 'axc'), 2/3)
        self.assertAlmostEqual(sim_sift4('xabxcdxxefxgx', '1ab2cd34ef5g6'),
                               0.538461538)

        self.assertAlmostEqual(sim_sift4('example', 'samples'), 0.714285714)
        self.assertAlmostEqual(sim_sift4('sturgeon', 'urgently'), 0.625)
        self.assertAlmostEqual(sim_sift4('levenshtein', 'frankenstein'), 0.5)
        self.assertAlmostEqual(sim_sift4('distance', 'difference'), 0.5)

        # Tests copied from
        # https://github.com/tdebatty/java-string-similarity/blob/master/src/test/java/info/debatty/java/stringsimilarity/experimental/Sift4Test.java
        self.assertAlmostEqual(sim_sift4('This is the first string',
                                         'And this is another string',
                                         5), 0.576923077)
        self.assertAlmostEqual(sim_sift4('Lorem ipsum dolor sit amet, ' +
                                         'consectetur adipiscing elit.',
                                         'Amet Lorm ispum dolor sit amet, ' +
                                         'consetetur adixxxpiscing elit.',
                                         10), 0.806451613)

        # cases with max_distance
        self.assertAlmostEqual(sim_sift4('example', 'samples', 5, 5),
                               0.285714286)
        self.assertAlmostEqual(sim_sift4('sturgeon', 'urgently', 5, 5), 0.375)
        self.assertAlmostEqual(sim_sift4('levenshtein', 'frankenstein', 5, 5),
                               0.583333333)
        self.assertAlmostEqual(sim_sift4('distance', 'difference', 5, 5), 0.5)
239
240
241
# Run the test cases defined in this module when it is executed as a script.
if __name__ == '__main__':
    unittest.main()
243