Completed
Branch master (87ccc1)
by Chris
08:42
created

tests.phonetic.test_phonetic_bmpm.BeiderMorseTestCases.test_bmpm()   D

Complexity

Conditions 10

Size

Total Lines 213
Code Lines 167

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
cc 10
eloc 167
nop 1
dl 0
loc 213
rs 4.1999
c 0
b 0
f 0

How to fix   Long Method    Complexity   

Long Method

Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.

For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.

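Commonly applied refactorings include Extract Method; if many parameters or temporary variables are present, Replace Temp with Query, Introduce Parameter Object, or Preserve Whole Object may also help.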

Complexity

Complex units of code like tests.phonetic.test_phonetic_bmpm.BeiderMorseTestCases.test_bmpm() often do a lot of different things. To break such a unit down, we need to identify a cohesive component within it. A common approach to find such a component is to look for fields/methods that share the same prefixes or suffixes.

Once you have determined the fields that belong together, you can apply the Extract Class refactoring. If the component makes sense as a sub-class, Extract Subclass is also a candidate, and is often faster.

1
# -*- coding: utf-8 -*-
2
3
# Copyright 2014-2018 by Christopher C. Little.
4
# This file is part of Abydos.
5
#
6
# Abydos is free software: you can redistribute it and/or modify
7
# it under the terms of the GNU General Public License as published by
8
# the Free Software Foundation, either version 3 of the License, or
9
# (at your option) any later version.
10
#
11
# Abydos is distributed in the hope that it will be useful,
12
# but WITHOUT ANY WARRANTY; without even the implied warranty of
13
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14
# GNU General Public License for more details.
15
#
16
# You should have received a copy of the GNU General Public License
17
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.
18
19
"""abydos.tests.test_phonetic_bmpm.
20
21
This module contains unit tests for abydos.phonetic.bmpm
22
"""
23
24
from __future__ import unicode_literals
25
26
import codecs
27
import unittest
28
29
# noinspection PyProtectedMember
30
from abydos.phonetic._bmdata import L_ANY, L_CYRILLIC, L_CZECH, L_DUTCH, \
31
    L_ENGLISH, L_FRENCH, L_GERMAN, L_GREEK, L_GREEKLATIN, L_HEBREW, \
32
    L_HUNGARIAN, L_ITALIAN, L_LATVIAN, L_POLISH, L_PORTUGUESE, L_ROMANIAN, \
33
    L_SPANISH, L_TURKISH
34
# noinspection PyProtectedMember
35
from abydos.phonetic.bmpm import _bm_apply_rule_if_compat, \
36
    _bm_expand_alternates, _bm_language, _bm_normalize_lang_attrs, \
37
    _bm_phonetic_number, _bm_remove_dupes, bmpm
38
39
40
from six import text_type
41
42
from .. import ALLOW_RANDOM, _corpus_file, _one_in
43
44
45
class BeiderMorseTestCases(unittest.TestCase):
46
    """Test BMPM functions.
47
48
    test cases for abydos.phonetic.bmpm
49
    """
50
51
    def test_bmpm(self):
        """Test abydos.phonetic.bmpm.bmpm.

        Most test cases from:
        http://svn.apache.org/viewvc/commons/proper/codec/trunk/src/test/java/org/apache/commons/codec/language/bm/

        As a rule, the test cases are copied from the above code, but the
        resultant values are not. This is largely because this Python port
        follows the PHP reference implementation much more closely than the
        Java port in Apache Commons Codec does. As a result, these tests have
        been conformed to the output produced by the PHP implementation,
        particularly in terms of formatting and ordering.

        The assertions are grouped into private helpers (one per group of
        upstream cases) which this method runs in their original order.
        """
        self._check_bmpm_base_cases()

        # testSolrGENERIC
        self._check_bmpm_solr('gen', {
            'exact': (
                'angelo anxelo anhelo anjelo anZelo andZelo',
                'angelo anxelo anhelo anjelo anZelo andZelo dangelo'
                ' danxelo danhelo danjelo danZelo dandZelo',
                'angelo anxelo andZelo',
            ),
            'approx': (
                'angilo angYlo agilo ongilo ongYlo ogilo Yngilo'
                ' YngYlo anxilo onxilo anilo onilo aniilo oniilo'
                ' anzilo onzilo',
                'angilo angYlo agilo ongilo ongYlo ogilo Yngilo'
                ' YngYlo anxilo onxilo anilo onilo aniilo oniilo'
                ' anzilo onzilo dangilo dangYlo dagilo dongilo'
                ' dongYlo dogilo dYngilo dYngYlo danxilo donxilo'
                ' danilo donilo daniilo doniilo danzilo donzilo',
                'angilo ongilo anxilo onxilo anzilo onzilo',
            ),
        }, False)

        # testSolrASHKENAZI
        self._check_bmpm_solr('ash', {
            'exact': (
                'angelo andZelo anhelo anxelo',
                'dangelo dandZelo danhelo danxelo',
                'anxelo angelo',
            ),
            'approx': (
                'angilo angYlo ongilo ongYlo Yngilo YngYlo anzilo'
                ' onzilo anilo onilo anxilo onxilo',
                'dangilo dangYlo dongilo dongYlo dYngilo dYngYlo'
                ' danzilo donzilo danilo donilo danxilo donxilo',
                'anxYlo anxilo onxYlo onxilo angYlo angilo ongYlo'
                ' ongilo',
            ),
        }, True)

        # testSolrSEPHARDIC
        sep_approx = 'anzila anzilu nzila nzilu anhila anhilu nhila nhilu'
        self._check_bmpm_solr('sep', {
            'exact': (
                'anZelo andZelo anxelo',
                'anZelo andZelo anxelo',
                'andZelo anxelo',
            ),
            'approx': (sep_approx, sep_approx, sep_approx),
        }, True)

        self._check_bmpm_compatibility()
        self._check_bmpm_phonetic_engine()

    def _check_bmpm_base_cases(self):
        """Check that the empty string encodes to '' or raises ValueError."""
        self.assertEqual(bmpm(''), '')

        # Language name that each restricted name mode is expected to
        # reject when it appears in a string-valued language argument.
        rejected_lang = {'ash': 'italian', 'sep': 'english'}

        for langs in ('', 1, 'spanish', 'english,italian', 3):
            for name_mode in ('gen', 'ash', 'sep'):
                bad_lang = rejected_lang.get(name_mode)
                # Integer language arguments are never expected to raise,
                # so the type check must precede the substring test.
                expect_error = (isinstance(langs, text_type) and
                                bad_lang is not None and
                                bad_lang in langs)
                for match_mode in ('approx', 'exact'):
                    for concat in (False, True):
                        if expect_error:
                            self.assertRaises(ValueError, bmpm, '', langs,
                                              name_mode, match_mode, concat)
                        else:
                            self.assertEqual(bmpm('', langs, name_mode,
                                                  match_mode, concat), '')

    def _check_bmpm_solr(self, name_mode, expected, langs_rejected):
        """Run the Solr-derived 'Angelo' cases for a single name mode.

        name_mode is passed straight to bmpm. expected maps each match
        mode ('exact', 'approx') to a 3-tuple of expected encodings: for
        'Angelo', for "D'Angelo", and for 'Angelo' restricted to
        'italian,greek,spanish'. If langs_rejected is true, the plain
        restricted call is expected to raise ValueError and the third
        expectation applies to the call with a sixth positional argument
        of True (presumably a language-filtering flag; confirm against
        bmpm's signature).

        Every case is checked with concat True and then False; the
        expected encodings of these single-word names are the same either
        way.
        """
        langs = 'italian,greek,spanish'
        for match_mode in ('exact', 'approx'):
            angelo, d_angelo, angelo_restricted = expected[match_mode]
            for concat in (True, False):
                self.assertEqual(
                    bmpm('Angelo', '', name_mode, match_mode, concat),
                    angelo)
                self.assertEqual(
                    bmpm('D\'Angelo', '', name_mode, match_mode, concat),
                    d_angelo)
                if langs_rejected:
                    self.assertRaises(ValueError, bmpm, 'Angelo', langs,
                                      name_mode, match_mode, concat)
                    self.assertEqual(
                        bmpm('Angelo', langs, name_mode, match_mode,
                             concat, True),
                        angelo_restricted)
                else:
                    self.assertEqual(
                        bmpm('Angelo', langs, name_mode, match_mode,
                             concat),
                        angelo_restricted)
                # A purely numeric "name" encodes to the empty string.
                self.assertEqual(
                    bmpm('1234', '', name_mode, match_mode, concat), '')

    def _check_bmpm_compatibility(self):
        """Check the testCompatibilityWithOriginalVersion cases."""
        cases = (
            ('abram', 'gen',
             'abram abrom avram avrom obram obrom ovram ovrom'
             ' Ybram Ybrom abran abron obran obron'),
            ('Bendzin', 'gen',
             'binzn bindzn vindzn bintsn vintsn'),
            ('abram', 'ash',
             'abram abrom avram avrom obram obrom ovram ovrom'
             ' Ybram Ybrom ombram ombrom imbram imbrom'),
            ('Halpern', 'ash',
             'alpirn alpYrn olpirn olpYrn Ylpirn YlpYrn xalpirn'
             ' xolpirn'),
        )
        for word, name_mode, encoding in cases:
            self.assertEqual(bmpm(word, '', name_mode, 'approx', False),
                             encoding)

    def _check_bmpm_phonetic_engine(self):
        """Check the PhoneticEngineTest cases."""
        cases = (
            ('Renault', 'gen', 'approx', True,
             'rinolt rino rinDlt rinalt rinult rinD rina rinu'),
            ('Renault', 'ash', 'approx', True,
             'rinDlt rinalt rinult rYnDlt rYnalt rYnult rinolt'),
            ('Renault', 'sep', 'approx', True,
             'rinDlt'),
            ('SntJohn-Smith', 'gen', 'exact', True,
             'sntjonsmit'),
            ('d\'ortley', 'gen', 'exact', True,
             'ortlaj ortlej dortlaj dortlej'),
            ('van helsing', 'gen', 'exact', False,
             'helSink helsink helzink xelsink elSink elsink'
             ' vanhelsink vanhelzink vanjelsink fanhelsink'
             ' fanhelzink banhelsink'),
        )
        for word, name_mode, match_mode, concat, encoding in cases:
            self.assertEqual(bmpm(word, '', name_mode, match_mode, concat),
                             encoding)
264
265
    def test_bmpm_misc(self):
        """Test abydos.phonetic.bmpm.bmpm (miscellaneous tests).

        These cases exist to raise code coverage and to exercise some of
        the test cases noted in the BMPM reference code.
        """
        # Expected encodings that several assertions below share.
        rodham_clinton_split = (
            'rodam rodom rYdam rYdom rodan rodon rodxam rodxom'
            ' rodxan rodxon rudam rudom klinton klnton klintun'
            ' klntun tzlinton tzlnton tzlintun tzlntun zlinton'
            ' zlnton'
        )
        bar_hayim_ash = 'Dm xDm'
        bar_hayim_gen_concat = (
            'barDm borDm bYrDm varDm vorDm barDn borDn barxDm'
            ' borxDm varxDm vorxDm barxDn borxDn'
        )

        # Ashkenazi with a discardable prefix
        self.assertEqual(bmpm('bar Hayim', name_mode='ash'), bar_hayim_ash)

        # concat behavior
        self.assertEqual(bmpm('Rodham Clinton', concat=False),
                         rodham_clinton_split)
        self.assertEqual(
            bmpm('Rodham Clinton', concat=True),
            'rodamklinton rodomklinton rodamklnton rodomklnton'
            ' rodamklintun rodomklintun rodamklntun rodomklntun'
            ' rodamtzlinton rodomtzlinton rodamtzlnton'
            ' rodomtzlnton rodamtzlintun rodomtzlintun'
            ' rodamtzlntun rodomtzlntun rodamzlinton'
            ' rodomzlinton rodamzlnton rodomzlnton rodanklinton'
            ' rodonklinton rodanklnton rodonklnton'
            ' rodxamklinton rodxomklinton rodxamklnton'
            ' rodxomklnton rodxanklinton rodxonklinton'
            ' rodxanklnton rodxonklnton rudamklinton'
            ' rudomklinton rudamklnton rudomklnton rudamklintun'
            ' rudomklintun rudamklntun rudomklntun'
            ' rudamtzlinton rudomtzlinton rudamtzlnton'
            ' rudomtzlnton rudamtzlintun rudomtzlintun'
            ' rudamtzlntun rudomtzlntun')

        # name_mode spellings: each group is expected to encode alike
        for ash_mode in ('ash', 'ashkenazi', 'Ashkenazi'):
            self.assertEqual(bmpm('bar Hayim', name_mode=ash_mode),
                             bar_hayim_ash)
        for gen_mode in ('gen', 'general', 'Mizrahi', 'mizrahi', 'miz'):
            self.assertEqual(
                bmpm('bar Hayim', name_mode=gen_mode, concat=True),
                bar_hayim_gen_concat)

        # an out-of-range language_arg results in L_ANY
        for out_of_range in (2**32, -4):
            self.assertEqual(
                bmpm('Rodham Clinton', language_arg=out_of_range),
                rodham_clinton_split)

        # etc. (for code coverage)
        self.assertEqual(bmpm('van Damme', name_mode='sep'), 'dami mi dam m')
331
332
    def test_bmpm_nachnamen(self):
        """Test abydos.phonetic.bmpm.bmpm (Nachnamen set).

        Does nothing unless ALLOW_RANDOM is true, because rows are
        sampled at random.
        """
        if not ALLOW_RANDOM:
            return
        corpus_path = _corpus_file('nachnamen.bm.csv')
        with codecs.open(corpus_path, encoding='utf-8') as corpus:
            next(corpus)  # discard the first line of the file
            for row in corpus:
                fields = row.strip().split(',')
                if fields[0] == '#':
                    continue
                # This test set is very large (~10000 entries), so only
                # about 20 randomly selected rows are checked per run.
                if not _one_in(500):
                    continue
                # Column 1: expected encoding with language_arg='german';
                # column 2: expected encoding with default arguments.
                self.assertEqual(bmpm(fields[0], language_arg='german'),
                                 fields[1])
                self.assertEqual(bmpm(fields[0]), fields[2])
347
348
    def test_bmpm_nachnamen_cc(self):
        """Test abydos.phonetic.bmpm.bmpm (Nachnamen, corner cases).

        Unlike the full Nachnamen test, no random sampling happens here:
        every row of the corner-case file is checked on every run.
        """
        corpus_path = _corpus_file('nachnamen.bm.cc.csv')
        with codecs.open(corpus_path, encoding='utf-8') as corpus:
            next(corpus)  # discard the first line of the file
            for row in corpus:
                fields = row.strip().split(',')
                if fields[0] == '#':
                    continue
                # Column 1: expected encoding with language_arg='german';
                # column 2: expected encoding with default arguments.
                self.assertEqual(bmpm(fields[0], language_arg='german'),
                                 fields[1])
                self.assertEqual(bmpm(fields[0]), fields[2])
361
362
    def test_bmpm_uscensus2000(self):
        """Test abydos.phonetic.bmpm.bmpm (US Census 2000 set).

        Does nothing unless ALLOW_RANDOM is true, because rows are
        sampled at random.
        """
        if not ALLOW_RANDOM:
            return
        # (match_mode, name_mode) pairs; the expected encoding for the
        # k-th pair (counting from 1) is read from CSV column k.
        mode_columns = (
            ('approx', 'gen'), ('approx', 'ash'), ('approx', 'sep'),
            ('exact', 'gen'), ('exact', 'ash'), ('exact', 'sep'),
        )
        # Decode explicitly as UTF-8, as the Nachnamen tests do, so that
        # the rows read do not depend on the platform's default encoding.
        with codecs.open(_corpus_file('uscensus2000.bm.csv'),
                         encoding='utf-8') as uscensus_ts:
            next(uscensus_ts)  # discard the first line of the file
            for cen_line in uscensus_ts:
                cen_line = cen_line.strip().split(',')
                # This test set is very large (~150000 entries)
                # so let's just randomly select about 20 for testing
                if cen_line[0] != '#' and _one_in(7500):
                    for column, (match_mode, name_mode) in enumerate(
                            mode_columns, 1):
                        self.assertEqual(bmpm(cen_line[0],
                                              match_mode=match_mode,
                                              name_mode=name_mode),
                                         cen_line[column])
385
386
    def test_bmpm_uscensus2000_cc(self):
        """Test abydos.phonetic.bmpm.bmpm (US Census 2000, corner cases)."""
        # (match_mode, name_mode) pairs; the expected encoding for the
        # k-th pair (counting from 1) is read from CSV column k.
        mode_columns = (
            ('approx', 'gen'), ('approx', 'ash'), ('approx', 'sep'),
            ('exact', 'gen'), ('exact', 'ash'), ('exact', 'sep'),
        )
        # Decode explicitly as UTF-8, as the Nachnamen tests do, so that
        # the rows read do not depend on the platform's default encoding.
        with codecs.open(_corpus_file('uscensus2000.bm.cc.csv'),
                         encoding='utf-8') as uscensus_ts:
            next(uscensus_ts)  # discard the first line of the file
            for cen_line in uscensus_ts:
                cen_line = cen_line.strip().split(',')
                # Only about one row in ten of the corner-case file is
                # checked on any given run.
                # NOTE(review): this sampling is not guarded by
                # ALLOW_RANDOM, unlike the full US Census test, so the
                # rows checked vary between runs -- confirm this is
                # intended.
                if cen_line[0] != '#' and _one_in(10):
                    for column, (match_mode, name_mode) in enumerate(
                            mode_columns, 1):
                        self.assertEqual(bmpm(cen_line[0],
                                              match_mode=match_mode,
                                              name_mode=name_mode),
                                         cen_line[column])
407
408
    def test_bm_phonetic_number(self):
        """Test abydos.phonetic.bmpm._bm_phonetic_number.

        Each input is expected to come back with everything from its
        first '[' onward removed, whether or not the bracket is closed.
        """
        cases = (
            ('', ''),
            ('abcd', 'abcd'),
            ('abcd[123]', 'abcd'),
            ('abcd[123', 'abcd'),
            ('abcd[', 'abcd'),
            ('abcd[[[123]]]', 'abcd'),
        )
        for phonetic, stripped in cases:
            self.assertEqual(_bm_phonetic_number(phonetic), stripped)
416
417
    def test_bm_apply_rule_if_compat(self):
        """Test abydos.phonetic.bmpm._bm_apply_rule_if_compat.

        Each case is (phonetic, target, language argument, expected
        result). An expected result of None marks a combination for which
        the function must return None; this is asserted with
        assertIsNone so that only the None singleton passes.
        """
        cases = (
            ('abc', 'def', 4, 'abcdef'),
            ('abc', 'def[6]', 4, 'abcdef[4]'),
            ('abc', 'def[4]', 4, 'abcdef[4]'),
            ('abc', 'def[0]', 4, None),
            ('abc', 'def[8]', 4, None),
            ('abc', 'def', 1, 'abcdef'),
            ('abc', 'def[4]', 1, 'abcdef[4]'),
        )
        for phonetic, target, lang, expected in cases:
            result = _bm_apply_rule_if_compat(phonetic, target, lang)
            if expected is None:
                self.assertIsNone(result)
            else:
                self.assertEqual(result, expected)
429
430
    def test_bm_language(self):
        """Test abydos.phonetic.bmpm._bm_language.

        Most test cases from:
        http://svn.apache.org/viewvc/commons/proper/codec/trunk/src/test/java/org/apache/commons/codec/language/bm/LanguageGuessingTest.java?view=markup
        """
        def check_exact(cases):
            # Every name must be guessed as exactly the given language(s).
            for name, languages in cases:
                self.assertEqual(_bm_language(name, 'gen'), languages)

        check_exact((
            ('Renault', L_FRENCH),
            ('Mickiewicz', L_POLISH),
        ))

        # For 'Thompson' only the presence of the English bit is required;
        # any other bits in the guess are masked out before comparing.
        thompson = _bm_language('Thompson', 'gen')
        self.assertEqual(thompson & L_ENGLISH, L_ENGLISH)

        check_exact((
            ('Nuñez', L_SPANISH),
            ('Carvalho', L_PORTUGUESE),
            ('Čapek', L_CZECH | L_LATVIAN),
            ('Sjneijder', L_DUTCH),
            ('Klausewitz', L_GERMAN),
            ('Küçük', L_TURKISH),
            ('Giacometti', L_ITALIAN),
            ('Nagy', L_HUNGARIAN),
            ('Ceauşescu', L_ROMANIAN),
            ('Angelopoulos', L_GREEKLATIN),
            ('Αγγελόπουλος', L_GREEK),
            ('Пушкин', L_CYRILLIC),
            ('כהן', L_HEBREW),
            ('ácz', L_ANY),
            ('átz', L_ANY),
        ))
455
456
    def test_bm_expand_alternates(self):
        """Test abydos.phonetic.bmpm._bm_expand_alternates.

        Each case pairs an input formula with the expansion expected for
        it.
        """
        cases = (
            # inputs without parenthesized groups come back unchanged
            ('', ''),
            ('aa', 'aa'),
            ('aa|bb', 'aa|bb'),
            ('aa|aa', 'aa|aa'),
            # two single-alternative groups
            ('(aa)(bb)', 'aabb'),
            ('(aa)(bb[0])', ''),
            ('(aa)(bb[4])', 'aabb[4]'),
            ('(aa[0])(bb)', ''),
            ('(aa[4])(bb)', 'aabb[4]'),
            # groups holding several alternatives each
            ('(a|b|c)(a|b|c)', 'aa|ab|ac|ba|bb|bc|ca|cb|cc'),
            ('(a[1]|b[2])(c|d)', 'ac[1]|ad[1]|bc[2]|bd[2]'),
            ('(a[1]|b[2])(c[4]|d)', 'ad[1]|bd[2]'),
        )
        for formula, expansion in cases:
            self.assertEqual(_bm_expand_alternates(formula), expansion)
475
476
    def test_bm_remove_dupes(self):
        """Test abydos.phonetic.bmpm._bm_remove_dupes.

        In the expected values, each '|'-separated item is kept once, at
        the position of its first occurrence.
        """
        cases = (
            ('', ''),
            ('aa', 'aa'),
            ('aa|bb', 'aa|bb'),
            ('aa|aa', 'aa'),
            ('aa|aa|aa|bb|aa', 'aa|bb'),
            ('bb|aa|bb|aa|bb', 'bb|aa'),
        )
        for alternates, deduped in cases:
            self.assertEqual(_bm_remove_dupes(alternates), deduped)
484
485
    def test_bm_normalize_lang_attrs(self):
        """Test abydos.phonetic.bmpm._bm_normalize_lang_attrs.

        The second argument is exercised with both False and True. With
        True, every expected result below is the text with all bracketed
        attributes removed.
        """
        # the empty string stays empty either way
        for flag in (False, True):
            self.assertEqual(_bm_normalize_lang_attrs('', flag), '')

        # an unclosed bracket is an error either way
        for flag in (False, True):
            self.assertRaises(ValueError, _bm_normalize_lang_attrs, 'a[1',
                              flag)

        # (input, expected result when the second argument is False)
        cases = (
            ('abc', 'abc'),
            ('abc[0]', '[0]'),
            ('abc[2]', 'abc[2]'),
            ('abc[2][4]', '[0]'),
            ('abc[2][6]', 'abc[2]'),
            ('ab[2]c[4]', '[0]'),
            ('ab[2]c[6]', 'abc[2]'),
        )
        for text, normalized in cases:
            self.assertEqual(_bm_normalize_lang_attrs(text, False),
                             normalized)

        # with True, each of the same inputs reduces to the bare text
        for text, _ in cases:
            self.assertEqual(_bm_normalize_lang_attrs(text, True), 'abc')
510
511
512
if __name__ == '__main__':
    # Run this module's test cases when it is executed as a script.
    unittest.main()
514