1
|
|
|
# -*- coding: utf-8 -*- |
2
|
|
|
|
3
|
|
|
# Copyright 2014-2018 by Christopher C. Little. |
4
|
|
|
# This file is part of Abydos. |
5
|
|
|
# |
6
|
|
|
# Abydos is free software: you can redistribute it and/or modify |
7
|
|
|
# it under the terms of the GNU General Public License as published by |
8
|
|
|
# the Free Software Foundation, either version 3 of the License, or |
9
|
|
|
# (at your option) any later version. |
10
|
|
|
# |
11
|
|
|
# Abydos is distributed in the hope that it will be useful, |
12
|
|
|
# but WITHOUT ANY WARRANTY; without even the implied warranty of |
13
|
|
|
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
14
|
|
|
# GNU General Public License for more details. |
15
|
|
|
# |
16
|
|
|
# You should have received a copy of the GNU General Public License |
17
|
|
|
# along with Abydos. If not, see <http://www.gnu.org/licenses/>. |
18
|
|
|
|
19
|
|
|
"""abydos.tests.test_phonetic_bmpm. |
20
|
|
|
|
21
|
|
|
This module contains unit tests for abydos.phonetic.bmpm |
22
|
|
|
""" |
23
|
|
|
|
24
|
|
|
from __future__ import unicode_literals |
25
|
|
|
|
26
|
|
|
import codecs |
27
|
|
|
import unittest |
28
|
|
|
|
29
|
|
|
# noinspection PyProtectedMember |
30
|
|
|
from abydos.phonetic._bmdata import L_ANY, L_CYRILLIC, L_CZECH, L_DUTCH, \ |
31
|
|
|
L_ENGLISH, L_FRENCH, L_GERMAN, L_GREEK, L_GREEKLATIN, L_HEBREW, \ |
32
|
|
|
L_HUNGARIAN, L_ITALIAN, L_LATVIAN, L_POLISH, L_PORTUGUESE, L_ROMANIAN, \ |
33
|
|
|
L_SPANISH, L_TURKISH |
34
|
|
|
# noinspection PyProtectedMember |
35
|
|
|
from abydos.phonetic.bmpm import _bm_apply_rule_if_compat, \ |
36
|
|
|
_bm_expand_alternates, _bm_language, _bm_normalize_lang_attrs, \ |
37
|
|
|
_bm_phonetic_number, _bm_remove_dupes, bmpm |
38
|
|
|
|
39
|
|
|
|
40
|
|
|
from six import text_type |
41
|
|
|
|
42
|
|
|
from .. import ALLOW_RANDOM, _corpus_file, _one_in |
43
|
|
|
|
44
|
|
|
|
45
|
|
|
class BeiderMorseTestCases(unittest.TestCase):
    """Test BMPM functions.

    Test cases for abydos.phonetic.bmpm: the public bmpm() encoder and the
    private helper functions it is built from.
    """

    def _check_nachnamen_line(self, nn_line):
        """Check one parsed row of a Nachnamen corpus file.

        nn_line is the list of comma-separated fields of one row: the name,
        the encoding expected with language_arg='german', and the encoding
        expected with default arguments.
        """
        self.assertEqual(bmpm(nn_line[0], language_arg='german'), nn_line[1])
        self.assertEqual(bmpm(nn_line[0]), nn_line[2])

    def _check_census_line(self, cen_line):
        """Check one parsed row of a US Census 2000 corpus file.

        cen_line is the list of comma-separated fields of one row: the name
        followed by six expected encodings, ordered approx/gen, approx/ash,
        approx/sep, exact/gen, exact/ash, exact/sep.
        """
        column = 1
        for match_mode in ('approx', 'exact'):
            for name_mode in ('gen', 'ash', 'sep'):
                self.assertEqual(bmpm(cen_line[0], match_mode=match_mode,
                                      name_mode=name_mode),
                                 cen_line[column])
                column += 1

    def test_bmpm(self):
        """Test abydos.phonetic.bmpm.bmpm.

        Most test cases from:
        http://svn.apache.org/viewvc/commons/proper/codec/trunk/src/test/java/org/apache/commons/codec/language/bm/

        As a rule, the test cases are copied from the above code, but the
        resultant values are not. This is largely because this Python port
        follows the PHP reference implementation much more closely than the
        Java port in Apache Commons Codec does. As a result, these tests have
        been conformed to the output produced by the PHP implementation,
        particularly in terms of formatting and ordering.
        """
        # base cases
        self.assertEqual(bmpm(''), '')

        for langs in ('', 1, 'spanish', 'english,italian', 3):
            for name_mode in ('gen', 'ash', 'sep'):
                # A language named as a string that is paired with the
                # 'ash' or 'sep' name mode as below is expected to be
                # rejected; this does not depend on the two inner loops.
                rejected = (isinstance(langs, text_type) and
                            ((name_mode == 'ash' and 'italian' in langs) or
                             (name_mode == 'sep' and 'english' in langs)))
                for match_mode in ('approx', 'exact'):
                    for concat in (False, True):
                        if rejected:
                            self.assertRaises(ValueError, bmpm, '', langs,
                                              name_mode, match_mode, concat)
                        else:
                            self.assertEqual(bmpm('', langs, name_mode,
                                                  match_mode, concat), '')

        multi = 'italian,greek,spanish'

        # The Solr-derived expectations below are the same whether concat
        # is true or false, so every case is checked with both settings.
        # NOTE(review): the sixth positional argument passed as True in
        # some calls is presumably bmpm's language-filtering switch --
        # confirm against bmpm's signature.
        for concat in (True, False):
            # testSolrGENERIC
            # ruleType is EXACT
            self.assertEqual(bmpm('Angelo', '', 'gen', 'exact', concat),
                             'angelo anxelo anhelo anjelo anZelo andZelo')
            self.assertEqual(
                bmpm('D\'Angelo', '', 'gen', 'exact', concat),
                'angelo anxelo anhelo anjelo anZelo andZelo dangelo'
                ' danxelo danhelo danjelo danZelo dandZelo')
            self.assertEqual(bmpm('Angelo', multi, 'gen', 'exact', concat),
                             'angelo anxelo andZelo')
            self.assertEqual(bmpm('1234', '', 'gen', 'exact', concat), '')

            # ruleType is APPROX
            self.assertEqual(
                bmpm('Angelo', '', 'gen', 'approx', concat),
                'angilo angYlo agilo ongilo ongYlo ogilo Yngilo'
                ' YngYlo anxilo onxilo anilo onilo aniilo oniilo'
                ' anzilo onzilo')
            self.assertEqual(
                bmpm('D\'Angelo', '', 'gen', 'approx', concat),
                'angilo angYlo agilo ongilo ongYlo ogilo Yngilo'
                ' YngYlo anxilo onxilo anilo onilo aniilo oniilo'
                ' anzilo onzilo dangilo dangYlo dagilo dongilo'
                ' dongYlo dogilo dYngilo dYngYlo danxilo donxilo'
                ' danilo donilo daniilo doniilo danzilo donzilo')
            self.assertEqual(bmpm('Angelo', multi, 'gen', 'approx', concat),
                             'angilo ongilo anxilo onxilo anzilo onzilo')
            self.assertEqual(bmpm('1234', '', 'gen', 'approx', concat), '')

            # testSolrASHKENAZI
            # ruleType is EXACT
            self.assertEqual(bmpm('Angelo', '', 'ash', 'exact', concat),
                             'angelo andZelo anhelo anxelo')
            self.assertEqual(bmpm('D\'Angelo', '', 'ash', 'exact', concat),
                             'dangelo dandZelo danhelo danxelo')
            self.assertRaises(ValueError, bmpm, 'Angelo', multi, 'ash',
                              'exact', concat)
            self.assertEqual(
                bmpm('Angelo', multi, 'ash', 'exact', concat, True),
                'anxelo angelo')
            self.assertEqual(bmpm('1234', '', 'ash', 'exact', concat), '')

            # ruleType is APPROX
            self.assertEqual(
                bmpm('Angelo', '', 'ash', 'approx', concat),
                'angilo angYlo ongilo ongYlo Yngilo YngYlo anzilo'
                ' onzilo anilo onilo anxilo onxilo')
            self.assertEqual(
                bmpm('D\'Angelo', '', 'ash', 'approx', concat),
                'dangilo dangYlo dongilo dongYlo dYngilo dYngYlo'
                ' danzilo donzilo danilo donilo danxilo donxilo')
            self.assertRaises(ValueError, bmpm, 'Angelo', multi, 'ash',
                              'approx', concat)
            self.assertEqual(
                bmpm('Angelo', multi, 'ash', 'approx', concat, True),
                'anxYlo anxilo onxYlo onxilo angYlo angilo ongYlo'
                ' ongilo')
            self.assertEqual(bmpm('1234', '', 'ash', 'approx', concat), '')

            # testSolrSEPHARDIC
            # ruleType is EXACT
            self.assertEqual(bmpm('Angelo', '', 'sep', 'exact', concat),
                             'anZelo andZelo anxelo')
            self.assertEqual(bmpm('D\'Angelo', '', 'sep', 'exact', concat),
                             'anZelo andZelo anxelo')
            self.assertRaises(ValueError, bmpm, 'Angelo', multi, 'sep',
                              'exact', concat)
            self.assertEqual(
                bmpm('Angelo', multi, 'sep', 'exact', concat, True),
                'andZelo anxelo')
            self.assertEqual(bmpm('1234', '', 'sep', 'exact', concat), '')

            # ruleType is APPROX
            self.assertEqual(
                bmpm('Angelo', '', 'sep', 'approx', concat),
                'anzila anzilu nzila nzilu anhila anhilu nhila nhilu')
            self.assertEqual(
                bmpm('D\'Angelo', '', 'sep', 'approx', concat),
                'anzila anzilu nzila nzilu anhila anhilu nhila nhilu')
            self.assertRaises(ValueError, bmpm, 'Angelo', multi, 'sep',
                              'approx', concat)
            self.assertEqual(
                bmpm('Angelo', multi, 'sep', 'approx', concat, True),
                'anzila anzilu nzila nzilu anhila anhilu nhila nhilu')
            self.assertEqual(bmpm('1234', '', 'sep', 'approx', concat), '')

        # testCompatibilityWithOriginalVersion
        self.assertEqual(bmpm('abram', '', 'gen', 'approx', False),
                         'abram abrom avram avrom obram obrom ovram ovrom'
                         ' Ybram Ybrom abran abron obran obron')
        self.assertEqual(bmpm('Bendzin', '', 'gen', 'approx', False),
                         'binzn bindzn vindzn bintsn vintsn')
        self.assertEqual(bmpm('abram', '', 'ash', 'approx', False),
                         'abram abrom avram avrom obram obrom ovram ovrom'
                         ' Ybram Ybrom ombram ombrom imbram imbrom')
        self.assertEqual(bmpm('Halpern', '', 'ash', 'approx', False),
                         'alpirn alpYrn olpirn olpYrn Ylpirn YlpYrn xalpirn'
                         ' xolpirn')

        # PhoneticEngineTest
        self.assertEqual(bmpm('Renault', '', 'gen', 'approx', True),
                         'rinolt rino rinDlt rinalt rinult rinD rina rinu')
        self.assertEqual(bmpm('Renault', '', 'ash', 'approx', True),
                         'rinDlt rinalt rinult rYnDlt rYnalt rYnult rinolt')
        self.assertEqual(bmpm('Renault', '', 'sep', 'approx', True),
                         'rinDlt')
        self.assertEqual(bmpm('SntJohn-Smith', '', 'gen', 'exact', True),
                         'sntjonsmit')
        self.assertEqual(bmpm('d\'ortley', '', 'gen', 'exact', True),
                         'ortlaj ortlej dortlaj dortlej')
        self.assertEqual(bmpm('van helsing', '', 'gen', 'exact', False),
                         'helSink helsink helzink xelsink elSink elsink'
                         ' vanhelsink vanhelzink vanjelsink fanhelsink'
                         ' fanhelzink banhelsink')

    def test_bmpm_misc(self):
        """Test abydos.phonetic.bmpm.bmpm (miscellaneous tests).

        The purpose of this test set is to achieve higher code coverage
        and to hit some of the test cases noted in the BMPM reference code.
        """
        # Expected encodings that several of the cases below share.
        rodham_clinton = ('rodam rodom rYdam rYdom rodan rodon rodxam rodxom'
                          ' rodxan rodxon rudam rudom klinton klnton klintun'
                          ' klntun tzlinton tzlnton tzlintun tzlntun zlinton'
                          ' zlnton')
        bar_hayim_gen = ('barDm borDm bYrDm varDm vorDm barDn borDn barxDm'
                         ' borxDm varxDm vorxDm barxDn borxDn')

        # test of Ashkenazi with discardable prefix
        self.assertEqual(bmpm('bar Hayim', name_mode='ash'), 'Dm xDm')

        # tests of concat behavior
        self.assertEqual(bmpm('Rodham Clinton', concat=False),
                         rodham_clinton)
        self.assertEqual(bmpm('Rodham Clinton', concat=True),
                         'rodamklinton rodomklinton rodamklnton rodomklnton'
                         ' rodamklintun rodomklintun rodamklntun rodomklntun'
                         ' rodamtzlinton rodomtzlinton rodamtzlnton'
                         ' rodomtzlnton rodamtzlintun rodomtzlintun'
                         ' rodamtzlntun rodomtzlntun rodamzlinton'
                         ' rodomzlinton rodamzlnton rodomzlnton rodanklinton'
                         ' rodonklinton rodanklnton rodonklnton'
                         ' rodxamklinton rodxomklinton rodxamklnton'
                         ' rodxomklnton rodxanklinton rodxonklinton'
                         ' rodxanklnton rodxonklnton rudamklinton'
                         ' rudomklinton rudamklnton rudomklnton rudamklintun'
                         ' rudomklintun rudamklntun rudomklntun'
                         ' rudamtzlinton rudomtzlinton rudamtzlnton'
                         ' rudomtzlnton rudamtzlintun rudomtzlintun'
                         ' rudamtzlntun rudomtzlntun')

        # tests of name_mode values: each spelling of a mode is expected
        # to produce the same encoding
        for name_mode in ('ash', 'ashkenazi', 'Ashkenazi'):
            self.assertEqual(bmpm('bar Hayim', name_mode=name_mode),
                             'Dm xDm')
        for name_mode in ('gen', 'general', 'Mizrahi', 'mizrahi', 'miz'):
            self.assertEqual(bmpm('bar Hayim', name_mode=name_mode,
                                  concat=True),
                             bar_hayim_gen)

        # test that out-of-range language_arg results in L_ANY
        for language_arg in (2**32, -4):
            self.assertEqual(bmpm('Rodham Clinton',
                                  language_arg=language_arg),
                             rodham_clinton)

        # etc. (for code coverage)
        self.assertEqual(bmpm('van Damme', name_mode='sep'), 'dami mi dam m')

    def test_bmpm_nachnamen(self):
        """Test abydos.phonetic.bmpm.bmpm (Nachnamen set)."""
        if not ALLOW_RANDOM:
            return
        with codecs.open(_corpus_file('nachnamen.bm.csv'),
                         encoding='utf-8') as nachnamen_testset:
            next(nachnamen_testset)  # discard the first line
            for nn_line in nachnamen_testset:
                nn_line = nn_line.strip().split(',')
                # This test set is very large (~10000 entries)
                # so let's just randomly select about 20 for testing
                if nn_line[0] != '#' and _one_in(500):
                    self._check_nachnamen_line(nn_line)

    def test_bmpm_nachnamen_cc(self):
        """Test abydos.phonetic.bmpm.bmpm (Nachnamen, corner cases)."""
        with codecs.open(_corpus_file('nachnamen.bm.cc.csv'),
                         encoding='utf-8') as nachnamen_testset:
            next(nachnamen_testset)  # discard the first line
            for nn_line in nachnamen_testset:
                nn_line = nn_line.strip().split(',')
                # Unlike the full Nachnamen set, the corner-case set is
                # not sampled: every row not marked with '#' is checked.
                if nn_line[0] != '#':
                    self._check_nachnamen_line(nn_line)

    def test_bmpm_uscensus2000(self):
        """Test abydos.phonetic.bmpm.bmpm (US Census 2000 set)."""
        if not ALLOW_RANDOM:
            return
        # Decode explicitly, as the Nachnamen tests do, instead of relying
        # on the platform's default encoding (assumes the census corpus is
        # ASCII/UTF-8 -- TODO confirm).
        with codecs.open(_corpus_file('uscensus2000.bm.csv'),
                         encoding='utf-8') as uscensus_ts:
            next(uscensus_ts)  # discard the first line
            for cen_line in uscensus_ts:
                cen_line = cen_line.strip().split(',')
                # This test set is very large (~150000 entries)
                # so let's just randomly select about 20 for testing
                if cen_line[0] != '#' and _one_in(7500):
                    self._check_census_line(cen_line)

    def test_bmpm_uscensus2000_cc(self):
        """Test abydos.phonetic.bmpm.bmpm (US Census 2000, corner cases)."""
        # Decode explicitly, as the Nachnamen tests do, instead of relying
        # on the platform's default encoding (assumes the census corpus is
        # ASCII/UTF-8 -- TODO confirm).
        with codecs.open(_corpus_file('uscensus2000.bm.cc.csv'),
                         encoding='utf-8') as uscensus_ts:
            next(uscensus_ts)  # discard the first line
            for cen_line in uscensus_ts:
                cen_line = cen_line.strip().split(',')
                # The corner-case set is sampled too, but far more densely
                # than the full census set: _one_in(10) rather than
                # _one_in(7500).
                if cen_line[0] != '#' and _one_in(10):
                    self._check_census_line(cen_line)

    def test_bm_phonetic_number(self):
        """Test abydos.phonetic.bmpm._bm_phonetic_number."""
        self.assertEqual(_bm_phonetic_number(''), '')
        self.assertEqual(_bm_phonetic_number('abcd'), 'abcd')
        self.assertEqual(_bm_phonetic_number('abcd[123]'), 'abcd')
        self.assertEqual(_bm_phonetic_number('abcd[123'), 'abcd')
        self.assertEqual(_bm_phonetic_number('abcd['), 'abcd')
        self.assertEqual(_bm_phonetic_number('abcd[[[123]]]'), 'abcd')

    def test_bm_apply_rule_if_compat(self):
        """Test abydos.phonetic.bmpm._bm_apply_rule_if_compat."""
        self.assertEqual(_bm_apply_rule_if_compat('abc', 'def', 4), 'abcdef')
        self.assertEqual(_bm_apply_rule_if_compat('abc', 'def[6]', 4),
                         'abcdef[4]')
        self.assertEqual(_bm_apply_rule_if_compat('abc', 'def[4]', 4),
                         'abcdef[4]')
        # These two attribute/language combinations are expected to yield
        # no result at all.
        self.assertIsNone(_bm_apply_rule_if_compat('abc', 'def[0]', 4))
        self.assertIsNone(_bm_apply_rule_if_compat('abc', 'def[8]', 4))
        self.assertEqual(_bm_apply_rule_if_compat('abc', 'def', 1), 'abcdef')
        self.assertEqual(_bm_apply_rule_if_compat('abc', 'def[4]', 1),
                         'abcdef[4]')

    def test_bm_language(self):
        """Test abydos.phonetic.bmpm._bm_language.

        Most test cases from:
        http://svn.apache.org/viewvc/commons/proper/codec/trunk/src/test/java/org/apache/commons/codec/language/bm/LanguageGuessingTest.java?view=markup
        """
        self.assertEqual(_bm_language('Renault', 'gen'), L_FRENCH)
        self.assertEqual(_bm_language('Mickiewicz', 'gen'), L_POLISH)
        # Only require that English is among the guessed languages.
        self.assertEqual(_bm_language('Thompson', 'gen') & L_ENGLISH,
                         L_ENGLISH)
        self.assertEqual(_bm_language('Nuñez', 'gen'), L_SPANISH)
        self.assertEqual(_bm_language('Carvalho', 'gen'), L_PORTUGUESE)
        self.assertEqual(_bm_language('Čapek', 'gen'), L_CZECH | L_LATVIAN)
        self.assertEqual(_bm_language('Sjneijder', 'gen'), L_DUTCH)
        self.assertEqual(_bm_language('Klausewitz', 'gen'), L_GERMAN)
        self.assertEqual(_bm_language('Küçük', 'gen'), L_TURKISH)
        self.assertEqual(_bm_language('Giacometti', 'gen'), L_ITALIAN)
        self.assertEqual(_bm_language('Nagy', 'gen'), L_HUNGARIAN)
        self.assertEqual(_bm_language('Ceauşescu', 'gen'), L_ROMANIAN)
        self.assertEqual(_bm_language('Angelopoulos', 'gen'), L_GREEKLATIN)
        self.assertEqual(_bm_language('Αγγελόπουλος', 'gen'), L_GREEK)
        self.assertEqual(_bm_language('Пушкин', 'gen'), L_CYRILLIC)
        self.assertEqual(_bm_language('כהן', 'gen'), L_HEBREW)
        self.assertEqual(_bm_language('ácz', 'gen'), L_ANY)
        self.assertEqual(_bm_language('átz', 'gen'), L_ANY)

    def test_bm_expand_alternates(self):
        """Test abydos.phonetic.bmpm._bm_expand_alternates."""
        # input without parenthesized groups
        self.assertEqual(_bm_expand_alternates(''), '')
        self.assertEqual(_bm_expand_alternates('aa'), 'aa')
        self.assertEqual(_bm_expand_alternates('aa|bb'), 'aa|bb')
        self.assertEqual(_bm_expand_alternates('aa|aa'), 'aa|aa')

        # groups holding a single alternative each
        self.assertEqual(_bm_expand_alternates('(aa)(bb)'), 'aabb')
        self.assertEqual(_bm_expand_alternates('(aa)(bb[0])'), '')
        self.assertEqual(_bm_expand_alternates('(aa)(bb[4])'), 'aabb[4]')
        self.assertEqual(_bm_expand_alternates('(aa[0])(bb)'), '')
        self.assertEqual(_bm_expand_alternates('(aa[4])(bb)'), 'aabb[4]')

        # groups holding several alternatives
        self.assertEqual(_bm_expand_alternates('(a|b|c)(a|b|c)'),
                         'aa|ab|ac|ba|bb|bc|ca|cb|cc')
        self.assertEqual(_bm_expand_alternates('(a[1]|b[2])(c|d)'),
                         'ac[1]|ad[1]|bc[2]|bd[2]')
        self.assertEqual(_bm_expand_alternates('(a[1]|b[2])(c[4]|d)'),
                         'ad[1]|bd[2]')

    def test_bm_remove_dupes(self):
        """Test abydos.phonetic.bmpm._bm_remove_dupes."""
        self.assertEqual(_bm_remove_dupes(''), '')
        self.assertEqual(_bm_remove_dupes('aa'), 'aa')
        self.assertEqual(_bm_remove_dupes('aa|bb'), 'aa|bb')
        self.assertEqual(_bm_remove_dupes('aa|aa'), 'aa')
        self.assertEqual(_bm_remove_dupes('aa|aa|aa|bb|aa'), 'aa|bb')
        self.assertEqual(_bm_remove_dupes('bb|aa|bb|aa|bb'), 'bb|aa')

    def test_bm_normalize_lang_attrs(self):
        """Test abydos.phonetic.bmpm._bm_normalize_lang_attrs."""
        self.assertEqual(_bm_normalize_lang_attrs('', False), '')
        self.assertEqual(_bm_normalize_lang_attrs('', True), '')

        # an unterminated attribute bracket is an error in either mode
        self.assertRaises(ValueError, _bm_normalize_lang_attrs, 'a[1', False)
        self.assertRaises(ValueError, _bm_normalize_lang_attrs, 'a[1', True)

        # second argument False
        self.assertEqual(_bm_normalize_lang_attrs('abc', False), 'abc')
        self.assertEqual(_bm_normalize_lang_attrs('abc[0]', False), '[0]')
        self.assertEqual(_bm_normalize_lang_attrs('abc[2]', False), 'abc[2]')
        self.assertEqual(_bm_normalize_lang_attrs('abc[2][4]', False), '[0]')
        self.assertEqual(_bm_normalize_lang_attrs('abc[2][6]', False),
                         'abc[2]')
        self.assertEqual(_bm_normalize_lang_attrs('ab[2]c[4]', False), '[0]')
        self.assertEqual(_bm_normalize_lang_attrs('ab[2]c[6]', False),
                         'abc[2]')

        # second argument True: every attribute is expected to be removed
        for phonetic in ('abc', 'abc[0]', 'abc[2]', 'abc[2][4]',
                         'abc[2][6]', 'ab[2]c[4]', 'ab[2]c[6]'):
            self.assertEqual(_bm_normalize_lang_attrs(phonetic, True), 'abc')
510
|
|
|
|
511
|
|
|
|
512
|
|
|
# Run every test in this module when the file is executed as a script.
if __name__ == '__main__':
    unittest.main()
514
|
|
|
|