|
1
|
|
|
# -*- coding: utf-8 -*- |
|
2
|
|
|
|
|
3
|
|
|
# Copyright 2014-2018 by Christopher C. Little. |
|
4
|
|
|
# This file is part of Abydos. |
|
5
|
|
|
# |
|
6
|
|
|
# Abydos is free software: you can redistribute it and/or modify |
|
7
|
|
|
# it under the terms of the GNU General Public License as published by |
|
8
|
|
|
# the Free Software Foundation, either version 3 of the License, or |
|
9
|
|
|
# (at your option) any later version. |
|
10
|
|
|
# |
|
11
|
|
|
# Abydos is distributed in the hope that it will be useful, |
|
12
|
|
|
# but WITHOUT ANY WARRANTY; without even the implied warranty of |
|
13
|
|
|
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
|
14
|
|
|
# GNU General Public License for more details. |
|
15
|
|
|
# |
|
16
|
|
|
# You should have received a copy of the GNU General Public License |
|
17
|
|
|
# along with Abydos. If not, see <http://www.gnu.org/licenses/>. |
|
18
|
|
|
|
|
19
|
|
|
"""abydos.tests.test_phonetic_bmpm. |
|
20
|
|
|
|
|
21
|
|
|
This module contains unit tests for abydos.phonetic.bmpm |
|
22
|
|
|
""" |
|
23
|
|
|
|
|
24
|
|
|
from __future__ import unicode_literals |
|
25
|
|
|
|
|
26
|
|
|
import codecs |
|
27
|
|
|
import unittest |
|
28
|
|
|
|
|
29
|
|
|
# noinspection PyProtectedMember |
|
30
|
|
|
from abydos.phonetic._bmdata import L_ANY, L_CYRILLIC, L_CZECH, L_DUTCH, \ |
|
31
|
|
|
L_ENGLISH, L_FRENCH, L_GERMAN, L_GREEK, L_GREEKLATIN, L_HEBREW, \ |
|
32
|
|
|
L_HUNGARIAN, L_ITALIAN, L_LATVIAN, L_POLISH, L_PORTUGUESE, L_ROMANIAN, \ |
|
33
|
|
|
L_SPANISH, L_TURKISH |
|
34
|
|
|
# noinspection PyProtectedMember |
|
35
|
|
|
from abydos.phonetic.bmpm import _bm_apply_rule_if_compat, \ |
|
36
|
|
|
_bm_expand_alternates, _bm_language, _bm_normalize_lang_attrs, \ |
|
37
|
|
|
_bm_phonetic_number, _bm_remove_dupes, bmpm |
|
38
|
|
|
|
|
39
|
|
|
|
|
40
|
|
|
from six import text_type |
|
41
|
|
|
|
|
42
|
|
|
from .. import ALLOW_RANDOM, _corpus_file, _one_in |
|
43
|
|
|
|
|
44
|
|
|
|
|
45
|
|
|
class BeiderMorseTestCases(unittest.TestCase):
    """Test BMPM functions.

    Test cases for abydos.phonetic.bmpm: the public ``bmpm`` encoder and
    the private helper functions imported alongside it.
    """

    def _check_nachnamen_row(self, nn_line):
        """Assert both expectations stored in one Nachnamen corpus row.

        :param list nn_line: the comma-split fields of a corpus row; field
            0 is the name, field 1 the expected encoding with
            ``language_arg='german'`` and field 2 the expected encoding
            with default arguments
        """
        self.assertEqual(bmpm(nn_line[0], language_arg='german'),
                         nn_line[1])
        self.assertEqual(bmpm(nn_line[0]), nn_line[2])

    def _check_census_row(self, cen_line):
        """Assert all six expectations stored in one US Census corpus row.

        :param list cen_line: the comma-split fields of a corpus row; field
            0 is the name and fields 1-6 hold the expected encodings for
            approx/gen, approx/ash, approx/sep, exact/gen, exact/ash and
            exact/sep, in that order
        """
        column = 1
        for match_mode in ('approx', 'exact'):
            for name_mode in ('gen', 'ash', 'sep'):
                self.assertEqual(bmpm(cen_line[0], match_mode=match_mode,
                                      name_mode=name_mode),
                                 cen_line[column])
                column += 1

    def test_bmpm(self):
        """Test abydos.phonetic.bmpm.bmpm.

        Most test cases from:
        http://svn.apache.org/viewvc/commons/proper/codec/trunk/src/test/java/org/apache/commons/codec/language/bm/

        As a rule, the test cases are copied from the above code, but the
        resultant values are not. This is largely because this Python port
        follows the PHP reference implementation much more closely than the
        Java port in Apache Commons Codec does. As a result, these tests have
        been conformed to the output produced by the PHP implementation,
        particularly in terms of formatting and ordering.

        In the Solr-derived sections the expected values do not depend on
        ``concat`` (every input is a single name), so each assertion is run
        once with ``concat=True`` and once with ``concat=False``.
        """
        multi_lang = 'italian,greek,spanish'

        # base cases
        self.assertEqual(bmpm(''), '')

        for langs in ('', 1, 'spanish', 'english,italian', 3):
            for name_mode in ('gen', 'ash', 'sep'):
                for match_mode in ('approx', 'exact'):
                    for concat in (False, True):
                        # A language named as a string that is combined
                        # with these name modes is expected to be rejected.
                        if (isinstance(langs, text_type) and
                                ((name_mode == 'ash' and
                                  'italian' in langs) or
                                 (name_mode == 'sep' and
                                  'english' in langs))):
                            self.assertRaises(ValueError, bmpm, '', langs,
                                              name_mode, match_mode, concat)
                        else:
                            self.assertEqual(bmpm('', langs, name_mode,
                                                  match_mode, concat), '')

        # testSolrGENERIC
        for concat in (True, False):
            # ruleType is EXACT
            self.assertEqual(
                bmpm('Angelo', '', 'gen', 'exact', concat),
                'angelo anxelo anhelo anjelo anZelo andZelo')
            self.assertEqual(
                bmpm('D\'Angelo', '', 'gen', 'exact', concat),
                'angelo anxelo anhelo anjelo anZelo andZelo dangelo' +
                ' danxelo danhelo danjelo danZelo dandZelo')
            self.assertEqual(
                bmpm('Angelo', multi_lang, 'gen', 'exact', concat),
                'angelo anxelo andZelo')
            self.assertEqual(bmpm('1234', '', 'gen', 'exact', concat), '')

            # ruleType is APPROX
            self.assertEqual(
                bmpm('Angelo', '', 'gen', 'approx', concat),
                'angilo angYlo agilo ongilo ongYlo ogilo Yngilo' +
                ' YngYlo anxilo onxilo anilo onilo aniilo oniilo' +
                ' anzilo onzilo')
            self.assertEqual(
                bmpm('D\'Angelo', '', 'gen', 'approx', concat),
                'angilo angYlo agilo ongilo ongYlo ogilo Yngilo' +
                ' YngYlo anxilo onxilo anilo onilo aniilo oniilo' +
                ' anzilo onzilo dangilo dangYlo dagilo dongilo' +
                ' dongYlo dogilo dYngilo dYngYlo danxilo donxilo' +
                ' danilo donilo daniilo doniilo danzilo donzilo')
            self.assertEqual(
                bmpm('Angelo', multi_lang, 'gen', 'approx', concat),
                'angilo ongilo anxilo onxilo anzilo onzilo')
            self.assertEqual(bmpm('1234', '', 'gen', 'approx', concat), '')

        # testSolrASHKENAZI
        # NOTE(review): the sixth positional argument passed as True below
        # is presumably the language-filtering flag -- confirm against the
        # signature of bmpm.
        for concat in (True, False):
            # ruleType is EXACT
            self.assertEqual(
                bmpm('Angelo', '', 'ash', 'exact', concat),
                'angelo andZelo anhelo anxelo')
            self.assertEqual(
                bmpm('D\'Angelo', '', 'ash', 'exact', concat),
                'dangelo dandZelo danhelo danxelo')
            self.assertRaises(ValueError, bmpm, 'Angelo', multi_lang,
                              'ash', 'exact', concat)
            self.assertEqual(
                bmpm('Angelo', multi_lang, 'ash', 'exact', concat, True),
                'anxelo angelo')
            self.assertEqual(bmpm('1234', '', 'ash', 'exact', concat), '')

            # ruleType is APPROX
            self.assertEqual(
                bmpm('Angelo', '', 'ash', 'approx', concat),
                'angilo angYlo ongilo ongYlo Yngilo YngYlo anzilo' +
                ' onzilo anilo onilo anxilo onxilo')
            self.assertEqual(
                bmpm('D\'Angelo', '', 'ash', 'approx', concat),
                'dangilo dangYlo dongilo dongYlo dYngilo dYngYlo' +
                ' danzilo donzilo danilo donilo danxilo donxilo')
            self.assertRaises(ValueError, bmpm, 'Angelo', multi_lang,
                              'ash', 'approx', concat)
            self.assertEqual(
                bmpm('Angelo', multi_lang, 'ash', 'approx', concat, True),
                'anxYlo anxilo onxYlo onxilo angYlo angilo ongYlo' +
                ' ongilo')
            self.assertEqual(bmpm('1234', '', 'ash', 'approx', concat), '')

        # testSolrSEPHARDIC
        for concat in (True, False):
            # ruleType is EXACT
            self.assertEqual(
                bmpm('Angelo', '', 'sep', 'exact', concat),
                'anZelo andZelo anxelo')
            self.assertEqual(
                bmpm('D\'Angelo', '', 'sep', 'exact', concat),
                'anZelo andZelo anxelo')
            self.assertRaises(ValueError, bmpm, 'Angelo', multi_lang,
                              'sep', 'exact', concat)
            self.assertEqual(
                bmpm('Angelo', multi_lang, 'sep', 'exact', concat, True),
                'andZelo anxelo')
            self.assertEqual(bmpm('1234', '', 'sep', 'exact', concat), '')

            # ruleType is APPROX
            self.assertEqual(
                bmpm('Angelo', '', 'sep', 'approx', concat),
                'anzila anzilu nzila nzilu anhila anhilu nhila nhilu')
            self.assertEqual(
                bmpm('D\'Angelo', '', 'sep', 'approx', concat),
                'anzila anzilu nzila nzilu anhila anhilu nhila nhilu')
            self.assertRaises(ValueError, bmpm, 'Angelo', multi_lang,
                              'sep', 'approx', concat)
            self.assertEqual(
                bmpm('Angelo', multi_lang, 'sep', 'approx', concat, True),
                'anzila anzilu nzila nzilu anhila anhilu nhila nhilu')
            self.assertEqual(bmpm('1234', '', 'sep', 'approx', concat), '')

        # testCompatibilityWithOriginalVersion
        self.assertEqual(bmpm('abram', '', 'gen', 'approx', False),
                         'abram abrom avram avrom obram obrom ovram ovrom' +
                         ' Ybram Ybrom abran abron obran obron')
        self.assertEqual(bmpm('Bendzin', '', 'gen', 'approx', False),
                         'binzn bindzn vindzn bintsn vintsn')
        self.assertEqual(bmpm('abram', '', 'ash', 'approx', False),
                         'abram abrom avram avrom obram obrom ovram ovrom' +
                         ' Ybram Ybrom ombram ombrom imbram imbrom')
        self.assertEqual(bmpm('Halpern', '', 'ash', 'approx', False),
                         'alpirn alpYrn olpirn olpYrn Ylpirn YlpYrn xalpirn' +
                         ' xolpirn')

        # PhoneticEngineTest
        self.assertEqual(bmpm('Renault', '', 'gen', 'approx', True),
                         'rinolt rino rinDlt rinalt rinult rinD rina rinu')
        self.assertEqual(bmpm('Renault', '', 'ash', 'approx', True),
                         'rinDlt rinalt rinult rYnDlt rYnalt rYnult rinolt')
        self.assertEqual(bmpm('Renault', '', 'sep', 'approx', True),
                         'rinDlt')
        self.assertEqual(bmpm('SntJohn-Smith', '', 'gen', 'exact', True),
                         'sntjonsmit')
        self.assertEqual(bmpm('d\'ortley', '', 'gen', 'exact', True),
                         'ortlaj ortlej dortlaj dortlej')
        self.assertEqual(bmpm('van helsing', '', 'gen', 'exact', False),
                         'helSink helsink helzink xelsink elSink elsink' +
                         ' vanhelsink vanhelzink vanjelsink fanhelsink' +
                         ' fanhelzink banhelsink')

    def test_bmpm_misc(self):
        """Test abydos.phonetic.bmpm.bmpm (miscellaneous tests).

        The purpose of this test set is to achieve higher code coverage
        and to hit some of the test cases noted in the BMPM reference code.
        """
        # Expected encoding of 'Rodham Clinton' with default arguments and
        # concat=False; reused by the out-of-range language_arg checks.
        rodham_separate = (
            'rodam rodom rYdam rYdom rodan rodon rodxam rodxom' +
            ' rodxan rodxon rudam rudom klinton klnton klintun' +
            ' klntun tzlinton tzlnton tzlintun tzlntun zlinton' +
            ' zlnton')
        # Expected encoding of 'bar Hayim' in the generic-style name modes
        # with concat=True.
        bar_hayim_concat = (
            'barDm borDm bYrDm varDm vorDm barDn borDn barxDm' +
            ' borxDm varxDm vorxDm barxDn borxDn')

        # test of Ashkenazi with discardable prefix
        self.assertEqual(bmpm('bar Hayim', name_mode='ash'), 'Dm xDm')

        # tests of concat behavior
        self.assertEqual(bmpm('Rodham Clinton', concat=False),
                         rodham_separate)
        self.assertEqual(bmpm('Rodham Clinton', concat=True),
                         'rodamklinton rodomklinton rodamklnton rodomklnton' +
                         ' rodamklintun rodomklintun rodamklntun rodomklntun' +
                         ' rodamtzlinton rodomtzlinton rodamtzlnton' +
                         ' rodomtzlnton rodamtzlintun rodomtzlintun' +
                         ' rodamtzlntun rodomtzlntun rodamzlinton' +
                         ' rodomzlinton rodamzlnton rodomzlnton rodanklinton' +
                         ' rodonklinton rodanklnton rodonklnton' +
                         ' rodxamklinton rodxomklinton rodxamklnton' +
                         ' rodxomklnton rodxanklinton rodxonklinton' +
                         ' rodxanklnton rodxonklnton rudamklinton' +
                         ' rudomklinton rudamklnton rudomklnton rudamklintun' +
                         ' rudomklintun rudamklntun rudomklntun' +
                         ' rudamtzlinton rudomtzlinton rudamtzlnton' +
                         ' rudomtzlnton rudamtzlintun rudomtzlintun' +
                         ' rudamtzlntun rudomtzlntun')

        # tests of name_mode values
        for name_mode in ('ash', 'ashkenazi', 'Ashkenazi'):
            self.assertEqual(bmpm('bar Hayim', name_mode=name_mode), 'Dm xDm')
        for name_mode in ('gen', 'general', 'Mizrahi', 'mizrahi', 'miz'):
            self.assertEqual(bmpm('bar Hayim', name_mode=name_mode,
                                  concat=True),
                             bar_hayim_concat)

        # test that out-of-range language_arg results in L_ANY
        self.assertEqual(bmpm('Rodham Clinton', language_arg=2**32),
                         rodham_separate)
        self.assertEqual(bmpm('Rodham Clinton', language_arg=-4),
                         rodham_separate)

        # etc. (for code coverage)
        self.assertEqual(bmpm('van Damme', name_mode='sep'), 'dami mi dam m')

    def test_bmpm_nachnamen(self):
        """Test abydos.phonetic.bmpm.bmpm (Nachnamen set).

        Does nothing (and so passes) when ALLOW_RANDOM is false, because
        the rows to check are chosen at random.
        """
        if not ALLOW_RANDOM:
            return
        with codecs.open(_corpus_file('nachnamen.bm.csv'),
                         encoding='utf-8') as nachnamen_testset:
            next(nachnamen_testset)  # skip the first (header) line
            for nn_line in nachnamen_testset:
                nn_line = nn_line.strip().split(',')
                # This test set is very large (~10000 entries)
                # so let's just randomly select about 20 for testing
                if nn_line[0] != '#' and _one_in(500):
                    self._check_nachnamen_row(nn_line)

    def test_bmpm_nachnamen_cc(self):
        """Test abydos.phonetic.bmpm.bmpm (Nachnamen, corner cases)."""
        with codecs.open(_corpus_file('nachnamen.bm.cc.csv'),
                         encoding='utf-8') as nachnamen_testset:
            next(nachnamen_testset)  # skip the first (header) line
            for nn_line in nachnamen_testset:
                nn_line = nn_line.strip().split(',')
                # Unlike the full Nachnamen set, the corner cases are not
                # sampled: every row whose first field is not '#' is
                # checked.
                if nn_line[0] != '#':
                    self._check_nachnamen_row(nn_line)

    def test_bmpm_uscensus2000(self):
        """Test abydos.phonetic.bmpm.bmpm (US Census 2000 set).

        Does nothing (and so passes) when ALLOW_RANDOM is false, because
        the rows to check are chosen at random.
        """
        if not ALLOW_RANDOM:
            return
        # Read with an explicit encoding, like the Nachnamen corpora, so
        # the result does not depend on the platform's default locale.
        # NOTE(review): assumes the corpus is ASCII/UTF-8 -- confirm.
        with codecs.open(_corpus_file('uscensus2000.bm.csv'),
                         encoding='utf-8') as uscensus_ts:
            next(uscensus_ts)  # skip the first (header) line
            for cen_line in uscensus_ts:
                cen_line = cen_line.strip().split(',')
                # This test set is very large (~150000 entries)
                # so let's just randomly select about 20 for testing
                if cen_line[0] != '#' and _one_in(7500):
                    self._check_census_row(cen_line)

    def test_bmpm_uscensus2000_cc(self):
        """Test abydos.phonetic.bmpm.bmpm (US Census 2000, corner cases)."""
        # Read with an explicit encoding, like the Nachnamen corpora, so
        # the result does not depend on the platform's default locale.
        # NOTE(review): assumes the corpus is ASCII/UTF-8 -- confirm.
        with codecs.open(_corpus_file('uscensus2000.bm.cc.csv'),
                         encoding='utf-8') as uscensus_ts:
            next(uscensus_ts)  # skip the first (header) line
            for cen_line in uscensus_ts:
                cen_line = cen_line.strip().split(',')
                # Only a random sample of the corner-case rows is checked.
                # NOTE(review): _one_in(10) presumably admits about one
                # row in ten, and this sampling is not gated on
                # ALLOW_RANDOM -- confirm both are intended.
                if cen_line[0] != '#' and _one_in(10):
                    self._check_census_row(cen_line)

    def test_bm_phonetic_number(self):
        """Test abydos.phonetic.bmpm._bm_phonetic_number."""
        self.assertEqual(_bm_phonetic_number(''), '')
        self.assertEqual(_bm_phonetic_number('abcd'), 'abcd')
        self.assertEqual(_bm_phonetic_number('abcd[123]'), 'abcd')
        self.assertEqual(_bm_phonetic_number('abcd[123'), 'abcd')
        self.assertEqual(_bm_phonetic_number('abcd['), 'abcd')
        self.assertEqual(_bm_phonetic_number('abcd[[[123]]]'), 'abcd')

    def test_bm_apply_rule_if_compat(self):
        """Test abydos.phonetic.bmpm._bm_apply_rule_if_compat."""
        self.assertEqual(_bm_apply_rule_if_compat('abc', 'def', 4), 'abcdef')
        self.assertEqual(_bm_apply_rule_if_compat('abc', 'def[6]', 4),
                         'abcdef[4]')
        self.assertEqual(_bm_apply_rule_if_compat('abc', 'def[4]', 4),
                         'abcdef[4]')
        # These two attribute/argument combinations are expected to yield
        # no result at all.
        self.assertIsNone(_bm_apply_rule_if_compat('abc', 'def[0]', 4))
        self.assertIsNone(_bm_apply_rule_if_compat('abc', 'def[8]', 4))
        self.assertEqual(_bm_apply_rule_if_compat('abc', 'def', 1), 'abcdef')
        self.assertEqual(_bm_apply_rule_if_compat('abc', 'def[4]', 1),
                         'abcdef[4]')

    def test_bm_language(self):
        """Test abydos.phonetic.bmpm._bm_language.

        Most test cases from:
        http://svn.apache.org/viewvc/commons/proper/codec/trunk/src/test/java/org/apache/commons/codec/language/bm/LanguageGuessingTest.java?view=markup
        """
        self.assertEqual(_bm_language('Renault', 'gen'), L_FRENCH)
        self.assertEqual(_bm_language('Mickiewicz', 'gen'), L_POLISH)
        # Only require that English is among the guessed languages.
        self.assertEqual(_bm_language('Thompson', 'gen') & L_ENGLISH,
                         L_ENGLISH)
        self.assertEqual(_bm_language('Nuñez', 'gen'), L_SPANISH)
        self.assertEqual(_bm_language('Carvalho', 'gen'), L_PORTUGUESE)
        self.assertEqual(_bm_language('Čapek', 'gen'), L_CZECH | L_LATVIAN)
        self.assertEqual(_bm_language('Sjneijder', 'gen'), L_DUTCH)
        self.assertEqual(_bm_language('Klausewitz', 'gen'), L_GERMAN)
        self.assertEqual(_bm_language('Küçük', 'gen'), L_TURKISH)
        self.assertEqual(_bm_language('Giacometti', 'gen'), L_ITALIAN)
        self.assertEqual(_bm_language('Nagy', 'gen'), L_HUNGARIAN)
        self.assertEqual(_bm_language('Ceauşescu', 'gen'), L_ROMANIAN)
        self.assertEqual(_bm_language('Angelopoulos', 'gen'), L_GREEKLATIN)
        self.assertEqual(_bm_language('Αγγελόπουλος', 'gen'), L_GREEK)
        self.assertEqual(_bm_language('Пушкин', 'gen'), L_CYRILLIC)
        self.assertEqual(_bm_language('כהן', 'gen'), L_HEBREW)
        self.assertEqual(_bm_language('ácz', 'gen'), L_ANY)
        self.assertEqual(_bm_language('átz', 'gen'), L_ANY)

    def test_bm_expand_alternates(self):
        """Test abydos.phonetic.bmpm._bm_expand_alternates."""
        # inputs without parenthesized groups come back unchanged
        self.assertEqual(_bm_expand_alternates(''), '')
        self.assertEqual(_bm_expand_alternates('aa'), 'aa')
        self.assertEqual(_bm_expand_alternates('aa|bb'), 'aa|bb')
        self.assertEqual(_bm_expand_alternates('aa|aa'), 'aa|aa')

        # single-alternative groups, with and without language attributes
        self.assertEqual(_bm_expand_alternates('(aa)(bb)'), 'aabb')
        self.assertEqual(_bm_expand_alternates('(aa)(bb[0])'), '')
        self.assertEqual(_bm_expand_alternates('(aa)(bb[4])'), 'aabb[4]')
        self.assertEqual(_bm_expand_alternates('(aa[0])(bb)'), '')
        self.assertEqual(_bm_expand_alternates('(aa[4])(bb)'), 'aabb[4]')

        # multi-alternative groups
        self.assertEqual(_bm_expand_alternates('(a|b|c)(a|b|c)'),
                         'aa|ab|ac|ba|bb|bc|ca|cb|cc')
        self.assertEqual(_bm_expand_alternates('(a[1]|b[2])(c|d)'),
                         'ac[1]|ad[1]|bc[2]|bd[2]')
        self.assertEqual(_bm_expand_alternates('(a[1]|b[2])(c[4]|d)'),
                         'ad[1]|bd[2]')

    def test_bm_remove_dupes(self):
        """Test abydos.phonetic.bmpm._bm_remove_dupes."""
        self.assertEqual(_bm_remove_dupes(''), '')
        self.assertEqual(_bm_remove_dupes('aa'), 'aa')
        self.assertEqual(_bm_remove_dupes('aa|bb'), 'aa|bb')
        self.assertEqual(_bm_remove_dupes('aa|aa'), 'aa')
        self.assertEqual(_bm_remove_dupes('aa|aa|aa|bb|aa'), 'aa|bb')
        self.assertEqual(_bm_remove_dupes('bb|aa|bb|aa|bb'), 'bb|aa')

    def test_bm_normalize_lang_attrs(self):
        """Test abydos.phonetic.bmpm._bm_normalize_lang_attrs."""
        self.assertEqual(_bm_normalize_lang_attrs('', False), '')
        self.assertEqual(_bm_normalize_lang_attrs('', True), '')

        # an unterminated attribute bracket is rejected either way
        self.assertRaises(ValueError, _bm_normalize_lang_attrs, 'a[1', False)
        self.assertRaises(ValueError, _bm_normalize_lang_attrs, 'a[1', True)

        # second argument False
        self.assertEqual(_bm_normalize_lang_attrs('abc', False), 'abc')
        self.assertEqual(_bm_normalize_lang_attrs('abc[0]', False), '[0]')
        self.assertEqual(_bm_normalize_lang_attrs('abc[2]', False), 'abc[2]')
        self.assertEqual(_bm_normalize_lang_attrs('abc[2][4]', False), '[0]')
        self.assertEqual(_bm_normalize_lang_attrs('abc[2][6]', False),
                         'abc[2]')
        self.assertEqual(_bm_normalize_lang_attrs('ab[2]c[4]', False), '[0]')
        self.assertEqual(_bm_normalize_lang_attrs('ab[2]c[6]', False),
                         'abc[2]')

        # second argument True: every attribute is expected to be removed
        self.assertEqual(_bm_normalize_lang_attrs('abc', True), 'abc')
        self.assertEqual(_bm_normalize_lang_attrs('abc[0]', True), 'abc')
        self.assertEqual(_bm_normalize_lang_attrs('abc[2]', True), 'abc')
        self.assertEqual(_bm_normalize_lang_attrs('abc[2][4]', True), 'abc')
        self.assertEqual(_bm_normalize_lang_attrs('abc[2][6]', True), 'abc')
        self.assertEqual(_bm_normalize_lang_attrs('ab[2]c[4]', True), 'abc')
        self.assertEqual(_bm_normalize_lang_attrs('ab[2]c[6]', True), 'abc')
|
510
|
|
|
|
|
511
|
|
|
|
|
512
|
|
|
# Run this module's tests when the file is executed directly as a script.
if __name__ == '__main__':
    unittest.main()
|
514
|
|
|
|