Completed
Branch master (87ccc1)
by Chris
08:42
created

tests.fuzz.fuzz_test_phonetic   B

Complexity

Total Complexity 52

Size/Duplication

Total Lines 335
Duplicated Lines 30.45 %

Importance

Changes 0
Metric Value
eloc 257
dl 102
loc 335
rs 7.44
c 0
b 0
f 0
wmc 52

How to fix   Duplicated Code    Complexity   

Duplicated Code

Duplicate code is one of the most pungent code smells. A rule that is often used is to re-structure code once it is duplicated in three or more places.

Common duplication problems and their corresponding solutions are:

Complexity

 Tip:   Before tackling complexity, make sure that you eliminate any duplication first. This can often reduce the size of classes significantly.

Complex classes like tests.fuzz.fuzz_test_phonetic often do a lot of different things. To break such a class down, we need to identify a cohesive component within that class. A common approach to find such a component is to look for fields/methods that share the same prefixes, or suffixes.

Once you have determined the fields that belong together, you can apply the Extract Class refactoring. If the component makes sense as a sub-class, Extract Subclass is also a candidate, and is often faster.

1
# -*- coding: utf-8 -*-
2
3
# Copyright 2018 by Christopher C. Little.
4
# This file is part of Abydos.
5
#
6
# Abydos is free software: you can redistribute it and/or modify
7
# it under the terms of the GNU General Public License as published by
8
# the Free Software Foundation, either version 3 of the License, or
9
# (at your option) any later version.
10
#
11
# Abydos is distributed in the hope that it will be useful,
12
# but WITHOUT ANY WARRANTY; without even the implied warranty of
13
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14
# GNU General Public License for more details.
15
#
16
# You should have received a copy of the GNU General Public License
17
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.
18
19
"""abydos.tests.fuzz.test_phonetic.
20
21
This module contains fuzz tests for abydos.phonetic
22
"""
23
24
import codecs
25
import unittest
26
from random import choice, randint, sample
27
28
from abydos.phonetic.alpha_sis import alpha_sis
29
from abydos.phonetic.bmpm import bmpm
30
from abydos.phonetic.caverphone import caverphone
31
from abydos.phonetic.davidson import davidson
32
from abydos.phonetic.de import haase_phonetik, koelner_phonetik, \
33
    koelner_phonetik_alpha, koelner_phonetik_num_to_alpha, phonem, \
34
    reth_schek_phonetik
35
from abydos.phonetic.dm import dm_soundex
36
from abydos.phonetic.dolby import dolby
37
from abydos.phonetic.es import phonetic_spanish, spanish_metaphone
38
from abydos.phonetic.eudex import eudex
39
from abydos.phonetic.fr import fonem, henry_early
40
from abydos.phonetic.hybrid import metasoundex, onca
41
from abydos.phonetic.metaphone import double_metaphone, metaphone
42
from abydos.phonetic.mra import mra
43
from abydos.phonetic.nrl import nrl
44
from abydos.phonetic.nysiis import nysiis
45
from abydos.phonetic.parmar_kumbharana import parmar_kumbharana
46
from abydos.phonetic.phonet import phonet
47
from abydos.phonetic.pt import soundex_br
48
from abydos.phonetic.roger_root import roger_root
49
from abydos.phonetic.russell import russell_index, russell_index_alpha, \
50
    russell_index_num_to_alpha
51
from abydos.phonetic.sound_d import sound_d
52
from abydos.phonetic.soundex import fuzzy_soundex, lein, phonex, phonix, \
53
    pshp_soundex_first, pshp_soundex_last, refined_soundex, soundex
54
from abydos.phonetic.spfc import spfc
55
from abydos.phonetic.statistics_canada import statistics_canada
56
from abydos.phonetic.sv import norphone, sfinxbis
57
58
from . import EXTREME_TEST, _corpus_file, _fuzz, _random_char
59
60
# Table of the phonetic algorithms exercised by the fuzz tests below.
#
# Each key is a label used in failure messages; each value is a callable that
# is invoked with a single word. Plain function references run an algorithm
# with its default arguments; lambdas are used only where an entry needs
# non-default arguments or chains two calls together.
algorithms = {
    'russell_index': russell_index,
    'russell_index_num_to_alpha':
        lambda name: russell_index_num_to_alpha(russell_index(name)),
    'russell_index_alpha': russell_index_alpha,
    'soundex': soundex,
    'reverse_soundex': lambda name: soundex(name, reverse=True),
    'soundex_0pad_ml6':
        lambda name: soundex(name, zero_pad=True, max_length=6),
    'soundex_special': lambda name: soundex(name, var='special'),
    'soundex_census': lambda name: soundex(name, var='Census'),
    'refined_soundex': refined_soundex,
    'refined_soundex_vowels':
        lambda name: refined_soundex(name, retain_vowels=True),
    'refined_soundex_0pad_ml6':
        lambda name: refined_soundex(name, zero_pad=True, max_length=6),
    'dm_soundex': dm_soundex,
    'koelner_phonetik': koelner_phonetik,
    'koelner_phonetik_num_to_alpha':
        lambda name: koelner_phonetik_num_to_alpha(koelner_phonetik(name)),
    'koelner_phonetik_alpha': koelner_phonetik_alpha,
    'nysiis': nysiis,
    'nysiis_modified': lambda name: nysiis(name, modified=True),
    'nysiis_ml_inf': lambda name: nysiis(name, max_length=-1),
    'mra': mra,
    'metaphone': metaphone,
    'double_metaphone': double_metaphone,
    'caverphone_1': lambda name: caverphone(name, version=1),
    'caverphone_2': caverphone,
    'alpha_sis': alpha_sis,
    'fuzzy_soundex': fuzzy_soundex,
    'fuzzy_soundex_0pad_ml8':
        lambda name: fuzzy_soundex(name, max_length=8, zero_pad=True),
    'phonex': phonex,
    'phonex_0pad_ml6':
        lambda name: phonex(name, max_length=6, zero_pad=True),
    'phonem': phonem,
    'phonix': phonix,
    'phonix_0pad_ml6':
        lambda name: phonix(name, max_length=6, zero_pad=True),
    'sfinxbis': sfinxbis,
    'sfinxbis_ml6': lambda name: sfinxbis(name, max_length=6),
    'phonet_1': phonet,
    'phonet_2': lambda name: phonet(name, mode=2),
    'phonet_1_none': lambda name: phonet(name, lang='none'),
    'phonet_2_none': lambda name: phonet(name, mode=2, lang='none'),
    # spfc is fed the word twice, joined by a space.
    'spfc': lambda name: spfc(' '.join((name, name))),
    'statistics_canada': statistics_canada,
    'statistics_canada_ml8':
        lambda name: statistics_canada(name, max_length=8),
    'lein': lein,
    'lein_nopad_ml8':
        lambda name: lein(name, max_length=8, zero_pad=False),
    'roger_root': roger_root,
    'roger_root_nopad_ml8':
        lambda name: roger_root(name, max_length=8, zero_pad=False),
    'onca': onca,
    'onca_nopad_ml8':
        lambda name: onca(name, max_length=8, zero_pad=False),
    'eudex': eudex,
    'haase_phonetik': haase_phonetik,
    'haase_phonetik_primary':
        lambda name: haase_phonetik(name, primary_only=True)[:1],
    'reth_schek_phonetik': reth_schek_phonetik,
    'fonem': fonem,
    'parmar_kumbharana': parmar_kumbharana,
    'davidson': davidson,
    'sound_d': sound_d,
    'sound_d_ml8': lambda name: sound_d(name, max_length=8),
    'pshp_soundex_last': pshp_soundex_last,
    'pshp_soundex_last_german':
        lambda name: pshp_soundex_last(name, german=True),
    'pshp_soundex_last_ml8':
        lambda name: pshp_soundex_last(name, max_length=8),
    'pshp_soundex_first': pshp_soundex_first,
    'pshp_soundex_first_german':
        lambda name: pshp_soundex_first(name, german=True),
    'pshp_soundex_first_ml8':
        lambda name: pshp_soundex_first(name, max_length=8),
    'henry_early': henry_early,
    'henry_early_ml8': lambda name: henry_early(name, max_length=8),
    'norphone': norphone,
    'dolby': dolby,
    'dolby_ml4': lambda name: dolby(name, max_length=4),
    'dolby_vowels': lambda name: dolby(name, keep_vowels=True),
    'phonetic_spanish': phonetic_spanish,
    'phonetic_spanish_ml4':
        lambda name: phonetic_spanish(name, max_length=4),
    'spanish_metaphone': spanish_metaphone,
    'spanish_metaphone_modified':
        lambda name: spanish_metaphone(name, modified=True),
    'spanish_metaphone_ml4':
        lambda name: spanish_metaphone(name, max_length=4),
    'metasoundex': metasoundex,
    'metasoundex_es': lambda name: metasoundex(name, lang='es'),
    'soundex_br': soundex_br,
    'nrl': nrl,
    'bmpm': bmpm,
}
163
164
165
class BigListOfNaughtyStringsTestCases(unittest.TestCase):
    """Test each phonetic algorithm against the BLNS set.

    Here, we test each algorithm against each string, but we only care that it
    does not result in an exception.

    While not actually a fuzz test, this does serve the purpose of looking for
    errors resulting from unanticipated input.
    """

    def fuzz_test_blns(self):
        """Test each phonetic algorithm against the BLNS set.

        Every non-empty, non-comment line of the ``blns.txt`` corpus file is
        passed to every entry of the module-level ``algorithms`` dict. The
        test fails on the first exception raised by any algorithm.

        Strings are flagged to be skipped by algorithms whose label contains
        ``'bmpm'`` when they lie between the 'Script Injection' and
        'SQL Injection' section headings of the corpus, or when they consist
        of more than five whitespace-separated tokens.
        """
        blns = []
        omit_section = False
        with codecs.open(_corpus_file('blns.txt'), encoding='UTF-8') as nsf:
            for line in nsf:
                # Drop only the line terminator. Slicing off the last
                # character unconditionally would truncate a final line that
                # has no trailing newline.
                line = ''.join(line.splitlines())
                if 'Script Injection' in line:
                    omit_section = True
                if 'SQL Injection' in line:
                    omit_section = False
                if line and line[0] != '#':
                    bmpm_omit = omit_section or len(line.split()) > 5
                    blns.append((bmpm_omit, line))

        for algo in algorithms:
            is_bmpm = 'bmpm' in algo
            for bmpm_omit, ns in blns:
                if is_bmpm and bmpm_omit:
                    continue
                try:
                    algorithms[algo](ns)
                except Exception as inst:
                    self.fail('Exception "{}" thrown by {} for BLNS: {}'
                              .format(inst, algo, ns))
198
199
200
class FuzzedWordsTestCases(unittest.TestCase):
    """Test each phonetic algorithm against the base words set.

    The base words are loaded once, when the class body is executed, from the
    ``basewords.txt`` corpus file. Each ``fuzz_test_*`` method feeds words
    (unfuzzed, fuzzed, or randomly generated) to entries of the module-level
    ``algorithms`` dict and fails on the first exception any of them raises.
    """

    # Number of random inputs generated by each fuzzing method.
    reps = 1000 * (10000 if EXTREME_TEST else 1)

    # Words longer than this are not passed to algorithms whose label
    # contains 'bmpm' in the base-word and fuzzed-base-word tests.
    # NOTE(review): presumably because bmpm is slow on long input -- confirm.
    _BMPM_MAX_LENGTH = 12

    # Number of algorithms sampled per input when EXTREME_TEST is off.
    _SAMPLE_SIZE = 5

    # Non-empty lines of the base words corpus.
    basewords = []
    with codecs.open(_corpus_file('basewords.txt'),
                     encoding='UTF-8') as basewords_file:
        for line in basewords_file:
            # Drop only the line terminator. Slicing off the last character
            # unconditionally would truncate a final line that has no
            # trailing newline.
            line = ''.join(line.splitlines())
            if line:
                basewords.append(line)

    def _check_algorithms(self, word, algs, bmpm_max_length=None):
        """Run each named algorithm on word, failing on any exception.

        :param word: the string passed to each algorithm
        :param algs: iterable of keys of the ``algorithms`` dict
        :param bmpm_max_length: if not None, algorithms whose label contains
            ``'bmpm'`` are skipped for words longer than this
        """
        for algo in algs:
            if (bmpm_max_length is not None and 'bmpm' in algo and
                    len(word) > bmpm_max_length):
                continue
            try:
                algorithms[algo](word)
            except Exception as inst:
                self.fail('Exception "{}" thrown by {} for word: {}'
                          .format(inst, algo, word))

    def _pick_algorithms(self):
        """Return all algorithm labels, or a random sample of them.

        Every label is returned when EXTREME_TEST is set; otherwise a random
        sample of ``_SAMPLE_SIZE`` labels is returned.
        """
        labels = list(algorithms.keys())
        if EXTREME_TEST:
            return labels
        return sample(labels, k=self._SAMPLE_SIZE)  # noqa: S311

    def _fuzz_all(self, make_word, bmpm_max_length=None):
        """Generate ``reps`` words and run the picked algorithms on each.

        :param make_word: zero-argument callable returning the next word
        :param bmpm_max_length: forwarded to ``_check_algorithms``
        """
        for _ in range(self.reps):
            # The word is generated before the algorithms are picked, so the
            # random calls happen in a fixed order within each iteration.
            fuzzed = make_word()
            self._check_algorithms(fuzzed, self._pick_algorithms(),
                                   bmpm_max_length)

    @staticmethod
    def _random_word(*char_args):
        """Return 8 to 16 random characters joined into a string.

        :param char_args: positional arguments forwarded to ``_random_char``
        """
        return ''.join(_random_char(*char_args) for _ in
                       range(0, randint(8, 16)))  # noqa: S311

    def fuzz_test_base(self):
        """Test each phonetic algorithm against the unfuzzed base words."""
        # Algorithm-major order: one algorithm sees every base word before
        # the next algorithm is tried.
        for algo in algorithms:
            for word in self.basewords:
                self._check_algorithms(word, (algo,), self._BMPM_MAX_LENGTH)

    def fuzz_test_20pct(self):
        """Fuzz test phonetic algorithms against 20% fuzzed words."""
        self._fuzz_all(
            lambda: _fuzz(choice(self.basewords),  # noqa: S311
                          fuzziness=0.2),
            bmpm_max_length=self._BMPM_MAX_LENGTH)

    def fuzz_test_100pct(self):
        """Fuzz test phonetic algorithms against 100% fuzzed words."""
        self._fuzz_all(
            lambda: _fuzz(choice(self.basewords),  # noqa: S311
                          fuzziness=1),
            bmpm_max_length=self._BMPM_MAX_LENGTH)

    def fuzz_test_fuzz_bmp(self):
        """Fuzz test phonetic algorithms against BMP fuzz."""
        self._fuzz_all(lambda: self._random_word(0xffff))

    def fuzz_test_fuzz_bmpsmp_letter(self):
        """Fuzz test phonetic algorithms against alphabetic BMP+SMP fuzz."""
        self._fuzz_all(lambda: self._random_word(0x1ffff, ' LETTER '))

    def fuzz_test_fuzz_bmpsmp_latin(self):
        """Fuzz test phonetic algorithms against Latin BMP+SMP fuzz."""
        self._fuzz_all(lambda: self._random_word(0x1ffff, 'LATIN '))

    def fuzz_test_fuzz_unicode(self):
        """Fuzz test phonetic algorithms against valid Unicode fuzz."""
        self._fuzz_all(lambda: self._random_word())
332
333
# Hand control to unittest's command-line runner when this file is executed
# directly rather than imported.
# NOTE(review): unittest's default loader only collects methods whose names
# start with 'test'; the fuzz_test_* methods above presumably rely on a
# differently configured runner -- confirm.
if __name__ == '__main__':
    unittest.main()
335