Conditions | 142 |
Total Lines | 1631 |
Code Lines | 1375 |
Lines | 0 |
Ratio | 0 % |
Tests | 272 |
CRAP Score | 142 |
Changes | 0 |
Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.
For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.
Commonly applied refactorings include: Extract Method.
If many parameters/temporary variables are present: Replace Temp with Query, Introduce Parameter Object, Preserve Whole Object.
Complex classes like abydos.phonetic._phonet.phonet() often do a lot of different things. To break such a class down, we need to identify a cohesive component within that class. A common approach to find such a component is to look for fields/methods that share the same prefixes, or suffixes.
Once you have determined the fields that belong together, you can apply the Extract Class refactoring. If the component makes sense as a sub-class, Extract Subclass is also a candidate, and is often faster.
1 | # -*- coding: utf-8 -*- |
||
36 | 1 | def phonet(word, mode=1, lang='de'): |
|
37 | """Return the phonet code for a word. |
||
38 | |||
39 | phonet ("Hannoveraner Phonetik") was developed by Jörg Michael and |
||
40 | documented in :cite:`Michael:1999`. |
||
41 | |||
42 | This is a port of Jesper Zedlitz's code, which is licensed LGPL |
||
43 | :cite:`Zedlitz:2015`. |
||
44 | |||
45 | That is, in turn, based on Michael's C code, which is also licensed LGPL |
||
46 | :cite:`Michael:2007`. |
||
47 | |||
48 | :param str word: the word to transform |
||
49 | :param int mode: the phonet variant to employ (1 or 2) |
||
50 | :param str lang: 'de' (default) for German |
||
51 | 'none' for no language |
||
52 | :returns: the phonet value |
||
53 | :rtype: str |
||
54 | |||
55 | >>> phonet('Christopher') |
||
56 | 'KRISTOFA' |
||
57 | >>> phonet('Niall') |
||
58 | 'NIAL' |
||
59 | >>> phonet('Smith') |
||
60 | 'SMIT' |
||
61 | >>> phonet('Schmidt') |
||
62 | 'SHMIT' |
||
63 | |||
64 | >>> phonet('Christopher', mode=2) |
||
65 | 'KRIZTUFA' |
||
66 | >>> phonet('Niall', mode=2) |
||
67 | 'NIAL' |
||
68 | >>> phonet('Smith', mode=2) |
||
69 | 'ZNIT' |
||
70 | >>> phonet('Schmidt', mode=2) |
||
71 | 'ZNIT' |
||
72 | |||
73 | >>> phonet('Christopher', lang='none') |
||
74 | 'CHRISTOPHER' |
||
75 | >>> phonet('Niall', lang='none') |
||
76 | 'NIAL' |
||
77 | >>> phonet('Smith', lang='none') |
||
78 | 'SMITH' |
||
79 | >>> phonet('Schmidt', lang='none') |
||
80 | 'SCHMIDT' |
||
81 | """ |
||
82 | 1 | _phonet_rules_no_lang = ( # separator chars |
|
83 | # fmt: off |
||
84 | '´', ' ', ' ', |
||
85 | '"', ' ', ' ', |
||
86 | '`$', '', '', |
||
87 | '\'', ' ', ' ', |
||
88 | ',', ',', ',', |
||
89 | ';', ',', ',', |
||
90 | '-', ' ', ' ', |
||
91 | ' ', ' ', ' ', |
||
92 | '.', '.', '.', |
||
93 | ':', '.', '.', |
||
94 | # German umlauts |
||
95 | 'Ä', 'AE', 'AE', |
||
96 | 'Ö', 'OE', 'OE', |
||
97 | 'Ü', 'UE', 'UE', |
||
98 | 'ß', 'S', 'S', |
||
99 | # international umlauts |
||
100 | 'À', 'A', 'A', |
||
101 | 'Á', 'A', 'A', |
||
102 | 'Â', 'A', 'A', |
||
103 | 'Ã', 'A', 'A', |
||
104 | 'Å', 'A', 'A', |
||
105 | 'Æ', 'AE', 'AE', |
||
106 | 'Ç', 'C', 'C', |
||
107 | 'Ð', 'DJ', 'DJ', |
||
108 | 'È', 'E', 'E', |
||
109 | 'É', 'E', 'E', |
||
110 | 'Ê', 'E', 'E', |
||
111 | 'Ë', 'E', 'E', |
||
112 | 'Ì', 'I', 'I', |
||
113 | 'Í', 'I', 'I', |
||
114 | 'Î', 'I', 'I', |
||
115 | 'Ï', 'I', 'I', |
||
116 | 'Ñ', 'NH', 'NH', |
||
117 | 'Ò', 'O', 'O', |
||
118 | 'Ó', 'O', 'O', |
||
119 | 'Ô', 'O', 'O', |
||
120 | 'Õ', 'O', 'O', |
||
121 | 'Œ', 'OE', 'OE', |
||
122 | 'Ø', 'OE', 'OE', |
||
123 | 'Š', 'SH', 'SH', |
||
124 | 'Þ', 'TH', 'TH', |
||
125 | 'Ù', 'U', 'U', |
||
126 | 'Ú', 'U', 'U', |
||
127 | 'Û', 'U', 'U', |
||
128 | 'Ý', 'Y', 'Y', |
||
129 | 'Ÿ', 'Y', 'Y', |
||
130 | # 'normal' letters (A-Z) |
||
131 | 'MC^', 'MAC', 'MAC', |
||
132 | 'MC^', 'MAC', 'MAC', |
||
133 | 'M´^', 'MAC', 'MAC', |
||
134 | 'M\'^', 'MAC', 'MAC', |
||
135 | 'O´^', 'O', 'O', |
||
136 | 'O\'^', 'O', 'O', |
||
137 | 'VAN DEN ^', 'VANDEN', 'VANDEN', |
||
138 | None, None, None |
||
139 | # fmt: on |
||
140 | ) |
||
141 | |||
142 | 1 | _phonet_rules_german = ( # separator chars |
|
143 | # fmt: off |
||
144 | '´', ' ', ' ', |
||
145 | '"', ' ', ' ', |
||
146 | '`$', '', '', |
||
147 | '\'', ' ', ' ', |
||
148 | ',', ' ', ' ', |
||
149 | ';', ' ', ' ', |
||
150 | '-', ' ', ' ', |
||
151 | ' ', ' ', ' ', |
||
152 | '.', '.', '.', |
||
153 | ':', '.', '.', |
||
154 | # German umlauts |
||
155 | 'ÄE', 'E', 'E', |
||
156 | 'ÄU<', 'EU', 'EU', |
||
157 | 'ÄV(AEOU)-<', 'EW', None, |
||
158 | 'Ä$', 'Ä', None, |
||
159 | 'Ä<', None, 'E', |
||
160 | 'Ä', 'E', None, |
||
161 | 'ÖE', 'Ö', 'Ö', |
||
162 | 'ÖU', 'Ö', 'Ö', |
||
163 | 'ÖVER--<', 'ÖW', None, |
||
164 | 'ÖV(AOU)-', 'ÖW', None, |
||
165 | 'ÜBEL(GNRW)-^^', 'ÜBL ', 'IBL ', |
||
166 | 'ÜBER^^', 'ÜBA', 'IBA', |
||
167 | 'ÜE', 'Ü', 'I', |
||
168 | 'ÜVER--<', 'ÜW', None, |
||
169 | 'ÜV(AOU)-', 'ÜW', None, |
||
170 | 'Ü', None, 'I', |
||
171 | 'ßCH<', None, 'Z', |
||
172 | 'ß<', 'S', 'Z', |
||
173 | # international umlauts |
||
174 | 'À<', 'A', 'A', |
||
175 | 'Á<', 'A', 'A', |
||
176 | 'Â<', 'A', 'A', |
||
177 | 'Ã<', 'A', 'A', |
||
178 | 'Å<', 'A', 'A', |
||
179 | 'ÆER-', 'E', 'E', |
||
180 | 'ÆU<', 'EU', 'EU', |
||
181 | 'ÆV(AEOU)-<', 'EW', None, |
||
182 | 'Æ$', 'Ä', None, |
||
183 | 'Æ<', None, 'E', |
||
184 | 'Æ', 'E', None, |
||
185 | 'Ç', 'Z', 'Z', |
||
186 | 'ÐÐ-', '', '', |
||
187 | 'Ð', 'DI', 'TI', |
||
188 | 'È<', 'E', 'E', |
||
189 | 'É<', 'E', 'E', |
||
190 | 'Ê<', 'E', 'E', |
||
191 | 'Ë', 'E', 'E', |
||
192 | 'Ì<', 'I', 'I', |
||
193 | 'Í<', 'I', 'I', |
||
194 | 'Î<', 'I', 'I', |
||
195 | 'Ï', 'I', 'I', |
||
196 | 'ÑÑ-', '', '', |
||
197 | 'Ñ', 'NI', 'NI', |
||
198 | 'Ò<', 'O', 'U', |
||
199 | 'Ó<', 'O', 'U', |
||
200 | 'Ô<', 'O', 'U', |
||
201 | 'Õ<', 'O', 'U', |
||
202 | 'Œ<', 'Ö', 'Ö', |
||
203 | 'Ø(IJY)-<', 'E', 'E', |
||
204 | 'Ø<', 'Ö', 'Ö', |
||
205 | 'Š', 'SH', 'Z', |
||
206 | 'Þ', 'T', 'T', |
||
207 | 'Ù<', 'U', 'U', |
||
208 | 'Ú<', 'U', 'U', |
||
209 | 'Û<', 'U', 'U', |
||
210 | 'Ý<', 'I', 'I', |
||
211 | 'Ÿ<', 'I', 'I', |
||
212 | # 'normal' letters (A-Z) |
||
213 | 'ABELLE$', 'ABL', 'ABL', |
||
214 | 'ABELL$', 'ABL', 'ABL', |
||
215 | 'ABIENNE$', 'ABIN', 'ABIN', |
||
216 | 'ACHME---^', 'ACH', 'AK', |
||
217 | 'ACEY$', 'AZI', 'AZI', |
||
218 | 'ADV', 'ATW', None, |
||
219 | 'AEGL-', 'EK', None, |
||
220 | 'AEU<', 'EU', 'EU', |
||
221 | 'AE2', 'E', 'E', |
||
222 | 'AFTRAUBEN------', 'AFT ', 'AFT ', |
||
223 | 'AGL-1', 'AK', None, |
||
224 | 'AGNI-^', 'AKN', 'AKN', |
||
225 | 'AGNIE-', 'ANI', 'ANI', |
||
226 | 'AGN(AEOU)-$', 'ANI', 'ANI', |
||
227 | 'AH(AIOÖUÜY)-', 'AH', None, |
||
228 | 'AIA2', 'AIA', 'AIA', |
||
229 | 'AIE$', 'E', 'E', |
||
230 | 'AILL(EOU)-', 'ALI', 'ALI', |
||
231 | 'AINE$', 'EN', 'EN', |
||
232 | 'AIRE$', 'ER', 'ER', |
||
233 | 'AIR-', 'E', 'E', |
||
234 | 'AISE$', 'ES', 'EZ', |
||
235 | 'AISSANCE$', 'ESANS', 'EZANZ', |
||
236 | 'AISSE$', 'ES', 'EZ', |
||
237 | 'AIX$', 'EX', 'EX', |
||
238 | 'AJ(AÄEÈÉÊIOÖUÜ)--', 'A', 'A', |
||
239 | 'AKTIE', 'AXIE', 'AXIE', |
||
240 | 'AKTUEL', 'AKTUEL', None, |
||
241 | 'ALOI^', 'ALOI', 'ALUI', # Don't merge these rules |
||
242 | 'ALOY^', 'ALOI', 'ALUI', # needed by 'check_rules' |
||
243 | 'AMATEU(RS)-', 'AMATÖ', 'ANATÖ', |
||
244 | 'ANCH(OEI)-', 'ANSH', 'ANZ', |
||
245 | 'ANDERGEGANG----', 'ANDA GE', 'ANTA KE', |
||
246 | 'ANDERGEHE----', 'ANDA ', 'ANTA ', |
||
247 | 'ANDERGESETZ----', 'ANDA GE', 'ANTA KE', |
||
248 | 'ANDERGING----', 'ANDA ', 'ANTA ', |
||
249 | 'ANDERSETZ(ET)-----', 'ANDA ', 'ANTA ', |
||
250 | 'ANDERZUGEHE----', 'ANDA ZU ', 'ANTA ZU ', |
||
251 | 'ANDERZUSETZE-----', 'ANDA ZU ', 'ANTA ZU ', |
||
252 | 'ANER(BKO)---^^', 'AN', None, |
||
253 | 'ANHAND---^$', 'AN H', 'AN ', |
||
254 | 'ANH(AÄEIOÖUÜY)--^^', 'AN', None, |
||
255 | 'ANIELLE$', 'ANIEL', 'ANIL', |
||
256 | 'ANIEL', 'ANIEL', None, |
||
257 | 'ANSTELLE----^$', 'AN ST', 'AN ZT', |
||
258 | 'ANTI^^', 'ANTI', 'ANTI', |
||
259 | 'ANVER^^', 'ANFA', 'ANFA', |
||
260 | 'ATIA$', 'ATIA', 'ATIA', |
||
261 | 'ATIA(NS)--', 'ATI', 'ATI', |
||
262 | 'ATI(AÄOÖUÜ)-', 'AZI', 'AZI', |
||
263 | 'AUAU--', '', '', |
||
264 | 'AUERE$', 'AUERE', None, |
||
265 | 'AUERE(NS)-$', 'AUERE', None, |
||
266 | 'AUERE(AIOUY)--', 'AUER', None, |
||
267 | 'AUER(AÄIOÖUÜY)-', 'AUER', None, |
||
268 | 'AUER<', 'AUA', 'AUA', |
||
269 | 'AUF^^', 'AUF', 'AUF', |
||
270 | 'AULT$', 'O', 'U', |
||
271 | 'AUR(BCDFGKLMNQSTVWZ)-', 'AUA', 'AUA', |
||
272 | 'AUR$', 'AUA', 'AUA', |
||
273 | 'AUSSE$', 'OS', 'UZ', |
||
274 | 'AUS(ST)-^', 'AUS', 'AUS', |
||
275 | 'AUS^^', 'AUS', 'AUS', |
||
276 | 'AUTOFAHR----', 'AUTO ', 'AUTU ', |
||
277 | 'AUTO^^', 'AUTO', 'AUTU', |
||
278 | 'AUX(IY)-', 'AUX', 'AUX', |
||
279 | 'AUX', 'O', 'U', |
||
280 | 'AU', 'AU', 'AU', |
||
281 | 'AVER--<', 'AW', None, |
||
282 | 'AVIER$', 'AWIE', 'AFIE', |
||
283 | 'AV(EÈÉÊI)-^', 'AW', None, |
||
284 | 'AV(AOU)-', 'AW', None, |
||
285 | 'AYRE$', 'EIRE', 'EIRE', |
||
286 | 'AYRE(NS)-$', 'EIRE', 'EIRE', |
||
287 | 'AYRE(AIOUY)--', 'EIR', 'EIR', |
||
288 | 'AYR(AÄIOÖUÜY)-', 'EIR', 'EIR', |
||
289 | 'AYR<', 'EIA', 'EIA', |
||
290 | 'AYER--<', 'EI', 'EI', |
||
291 | 'AY(AÄEIOÖUÜY)--', 'A', 'A', |
||
292 | 'AË', 'E', 'E', |
||
293 | 'A(IJY)<', 'EI', 'EI', |
||
294 | 'BABY^$', 'BEBI', 'BEBI', |
||
295 | 'BAB(IY)^', 'BEBI', 'BEBI', |
||
296 | 'BEAU^$', 'BO', None, |
||
297 | 'BEA(BCMNRU)-^', 'BEA', 'BEA', |
||
298 | 'BEAT(AEIMORU)-^', 'BEAT', 'BEAT', |
||
299 | 'BEE$', 'BI', 'BI', |
||
300 | 'BEIGE^$', 'BESH', 'BEZ', |
||
301 | 'BENOIT--', 'BENO', 'BENU', |
||
302 | 'BER(DT)-', 'BER', None, |
||
303 | 'BERN(DT)-', 'BERN', None, |
||
304 | 'BE(LMNRST)-^', 'BE', 'BE', |
||
305 | 'BETTE$', 'BET', 'BET', |
||
306 | 'BEVOR^$', 'BEFOR', None, |
||
307 | 'BIC$', 'BIZ', 'BIZ', |
||
308 | 'BOWL(EI)-', 'BOL', 'BUL', |
||
309 | 'BP(AÄEÈÉÊIÌÍÎOÖRUÜY)-', 'B', 'B', |
||
310 | 'BRINGEND-----^', 'BRI', 'BRI', |
||
311 | 'BRINGEND-----', ' BRI', ' BRI', |
||
312 | 'BROW(NS)-', 'BRAU', 'BRAU', |
||
313 | 'BUDGET7', 'BÜGE', 'BIKE', |
||
314 | 'BUFFET7', 'BÜFE', 'BIFE', |
||
315 | 'BYLLE$', 'BILE', 'BILE', |
||
316 | 'BYLL$', 'BIL', 'BIL', |
||
317 | 'BYPA--^', 'BEI', 'BEI', |
||
318 | 'BYTE<', 'BEIT', 'BEIT', |
||
319 | 'BY9^', 'BÜ', None, |
||
320 | 'B(SßZ)$', 'BS', None, |
||
321 | 'CACH(EI)-^', 'KESH', 'KEZ', |
||
322 | 'CAE--', 'Z', 'Z', |
||
323 | 'CA(IY)$', 'ZEI', 'ZEI', |
||
324 | 'CE(EIJUY)--', 'Z', 'Z', |
||
325 | 'CENT<', 'ZENT', 'ZENT', |
||
326 | 'CERST(EI)----^', 'KE', 'KE', |
||
327 | 'CER$', 'ZA', 'ZA', |
||
328 | 'CE3', 'ZE', 'ZE', |
||
329 | 'CH\'S$', 'X', 'X', |
||
330 | 'CH´S$', 'X', 'X', |
||
331 | 'CHAO(ST)-', 'KAO', 'KAU', |
||
332 | 'CHAMPIO-^', 'SHEMPI', 'ZENBI', |
||
333 | 'CHAR(AI)-^', 'KAR', 'KAR', |
||
334 | 'CHAU(CDFSVWXZ)-', 'SHO', 'ZU', |
||
335 | 'CHÄ(CF)-', 'SHE', 'ZE', |
||
336 | 'CHE(CF)-', 'SHE', 'ZE', |
||
337 | 'CHEM-^', 'KE', 'KE', # or: 'CHE', 'KE' |
||
338 | 'CHEQUE<', 'SHEK', 'ZEK', |
||
339 | 'CHI(CFGPVW)-', 'SHI', 'ZI', |
||
340 | 'CH(AEUY)-<^', 'SH', 'Z', |
||
341 | 'CHK-', '', '', |
||
342 | 'CHO(CKPS)-^', 'SHO', 'ZU', |
||
343 | 'CHRIS-', 'KRI', None, |
||
344 | 'CHRO-', 'KR', None, |
||
345 | 'CH(LOR)-<^', 'K', 'K', |
||
346 | 'CHST-', 'X', 'X', |
||
347 | 'CH(SßXZ)3', 'X', 'X', |
||
348 | 'CHTNI-3', 'CHN', 'KN', |
||
349 | 'CH^', 'K', 'K', # or: 'CH', 'K' |
||
350 | 'CH', 'CH', 'K', |
||
351 | 'CIC$', 'ZIZ', 'ZIZ', |
||
352 | 'CIENCEFICT----', 'EIENS ', 'EIENZ ', |
||
353 | 'CIENCE$', 'EIENS', 'EIENZ', |
||
354 | 'CIER$', 'ZIE', 'ZIE', |
||
355 | 'CYB-^', 'ZEI', 'ZEI', |
||
356 | 'CY9^', 'ZÜ', 'ZI', |
||
357 | 'C(IJY)-<3', 'Z', 'Z', |
||
358 | 'CLOWN-', 'KLAU', 'KLAU', |
||
359 | 'CCH', 'Z', 'Z', |
||
360 | 'CCE-', 'X', 'X', |
||
361 | 'C(CK)-', '', '', |
||
362 | 'CLAUDET---', 'KLO', 'KLU', |
||
363 | 'CLAUDINE^$', 'KLODIN', 'KLUTIN', |
||
364 | 'COACH', 'KOSH', 'KUZ', |
||
365 | 'COLE$', 'KOL', 'KUL', |
||
366 | 'COUCH', 'KAUSH', 'KAUZ', |
||
367 | 'COW', 'KAU', 'KAU', |
||
368 | 'CQUES$', 'K', 'K', |
||
369 | 'CQUE', 'K', 'K', |
||
370 | 'CRASH--9', 'KRE', 'KRE', |
||
371 | 'CREAT-^', 'KREA', 'KREA', |
||
372 | 'CST', 'XT', 'XT', |
||
373 | 'CS<^', 'Z', 'Z', |
||
374 | 'C(SßX)', 'X', 'X', |
||
375 | 'CT\'S$', 'X', 'X', |
||
376 | 'CT(SßXZ)', 'X', 'X', |
||
377 | 'CZ<', 'Z', 'Z', |
||
378 | 'C(ÈÉÊÌÍÎÝ)3', 'Z', 'Z', |
||
379 | 'C.^', 'C.', 'C.', |
||
380 | 'CÄ-', 'Z', 'Z', |
||
381 | 'CÜ$', 'ZÜ', 'ZI', |
||
382 | 'C\'S$', 'X', 'X', |
||
383 | 'C<', 'K', 'K', |
||
384 | 'DAHER^$', 'DAHER', None, |
||
385 | 'DARAUFFOLGE-----', 'DARAUF ', 'TARAUF ', |
||
386 | 'DAVO(NR)-^$', 'DAFO', 'TAFU', |
||
387 | 'DD(SZ)--<', '', '', |
||
388 | 'DD9', 'D', None, |
||
389 | 'DEPOT7', 'DEPO', 'TEBU', |
||
390 | 'DESIGN', 'DISEIN', 'TIZEIN', |
||
391 | 'DE(LMNRST)-3^', 'DE', 'TE', |
||
392 | 'DETTE$', 'DET', 'TET', |
||
393 | 'DH$', 'T', None, |
||
394 | 'DIC$', 'DIZ', 'TIZ', |
||
395 | 'DIDR-^', 'DIT', None, |
||
396 | 'DIEDR-^', 'DIT', None, |
||
397 | 'DJ(AEIOU)-^', 'I', 'I', |
||
398 | 'DMITR-^', 'DIMIT', 'TINIT', |
||
399 | 'DRY9^', 'DRÜ', None, |
||
400 | 'DT-', '', '', |
||
401 | 'DUIS-^', 'DÜ', 'TI', |
||
402 | 'DURCH^^', 'DURCH', 'TURK', |
||
403 | 'DVA$', 'TWA', None, |
||
404 | 'DY9^', 'DÜ', None, |
||
405 | 'DYS$', 'DIS', None, |
||
406 | 'DS(CH)--<', 'T', 'T', |
||
407 | 'DST', 'ZT', 'ZT', |
||
408 | 'DZS(CH)--', 'T', 'T', |
||
409 | 'D(SßZ)', 'Z', 'Z', |
||
410 | 'D(AÄEIOÖRUÜY)-', 'D', None, |
||
411 | 'D(ÀÁÂÃÅÈÉÊÌÍÎÙÚÛ)-', 'D', None, |
||
412 | 'D\'H^', 'D', 'T', |
||
413 | 'D´H^', 'D', 'T', |
||
414 | 'D`H^', 'D', 'T', |
||
415 | 'D\'S3$', 'Z', 'Z', |
||
416 | 'D´S3$', 'Z', 'Z', |
||
417 | 'D^', 'D', None, |
||
418 | 'D', 'T', 'T', |
||
419 | 'EAULT$', 'O', 'U', |
||
420 | 'EAUX$', 'O', 'U', |
||
421 | 'EAU', 'O', 'U', |
||
422 | 'EAV', 'IW', 'IF', |
||
423 | 'EAS3$', 'EAS', None, |
||
424 | 'EA(AÄEIOÖÜY)-3', 'EA', 'EA', |
||
425 | 'EA3$', 'EA', 'EA', |
||
426 | 'EA3', 'I', 'I', |
||
427 | 'EBENSO^$', 'EBNSO', 'EBNZU', |
||
428 | 'EBENSO^^', 'EBNSO ', 'EBNZU ', |
||
429 | 'EBEN^^', 'EBN', 'EBN', |
||
430 | 'EE9', 'E', 'E', |
||
431 | 'EGL-1', 'EK', None, |
||
432 | 'EHE(IUY)--1', 'EH', None, |
||
433 | 'EHUNG---1', 'E', None, |
||
434 | 'EH(AÄIOÖUÜY)-1', 'EH', None, |
||
435 | 'EIEI--', '', '', |
||
436 | 'EIERE^$', 'EIERE', None, |
||
437 | 'EIERE$', 'EIERE', None, |
||
438 | 'EIERE(NS)-$', 'EIERE', None, |
||
439 | 'EIERE(AIOUY)--', 'EIER', None, |
||
440 | 'EIER(AÄIOÖUÜY)-', 'EIER', None, |
||
441 | 'EIER<', 'EIA', None, |
||
442 | 'EIGL-1', 'EIK', None, |
||
443 | 'EIGH$', 'EI', 'EI', |
||
444 | 'EIH--', 'E', 'E', |
||
445 | 'EILLE$', 'EI', 'EI', |
||
446 | 'EIR(BCDFGKLMNQSTVWZ)-', 'EIA', 'EIA', |
||
447 | 'EIR$', 'EIA', 'EIA', |
||
448 | 'EITRAUBEN------', 'EIT ', 'EIT ', |
||
449 | 'EI', 'EI', 'EI', |
||
450 | 'EJ$', 'EI', 'EI', |
||
451 | 'ELIZ^', 'ELIS', None, |
||
452 | 'ELZ^', 'ELS', None, |
||
453 | 'EL-^', 'E', 'E', |
||
454 | 'ELANG----1', 'E', 'E', |
||
455 | 'EL(DKL)--1', 'E', 'E', |
||
456 | 'EL(MNT)--1$', 'E', 'E', |
||
457 | 'ELYNE$', 'ELINE', 'ELINE', |
||
458 | 'ELYN$', 'ELIN', 'ELIN', |
||
459 | 'EL(AÄEÈÉÊIÌÍÎOÖUÜY)-1', 'EL', 'EL', |
||
460 | 'EL-1', 'L', 'L', |
||
461 | 'EM-^', None, 'E', |
||
462 | 'EM(DFKMPQT)--1', None, 'E', |
||
463 | 'EM(AÄEÈÉÊIÌÍÎOÖUÜY)--1', None, 'E', |
||
464 | 'EM-1', None, 'N', |
||
465 | 'ENGAG-^', 'ANGA', 'ANKA', |
||
466 | 'EN-^', 'E', 'E', |
||
467 | 'ENTUEL', 'ENTUEL', None, |
||
468 | 'EN(CDGKQSTZ)--1', 'E', 'E', |
||
469 | 'EN(AÄEÈÉÊIÌÍÎNOÖUÜY)-1', 'EN', 'EN', |
||
470 | 'EN-1', '', '', |
||
471 | 'ERH(AÄEIOÖUÜ)-^', 'ERH', 'ER', |
||
472 | 'ER-^', 'E', 'E', |
||
473 | 'ERREGEND-----', ' ER', ' ER', |
||
474 | 'ERT1$', 'AT', None, |
||
475 | 'ER(DGLKMNRQTZß)-1', 'ER', None, |
||
476 | 'ER(AÄEÈÉÊIÌÍÎOÖUÜY)-1', 'ER', 'A', |
||
477 | 'ER1$', 'A', 'A', |
||
478 | 'ER<1', 'A', 'A', |
||
479 | 'ETAT7', 'ETA', 'ETA', |
||
480 | 'ETI(AÄOÖÜU)-', 'EZI', 'EZI', |
||
481 | 'EUERE$', 'EUERE', None, |
||
482 | 'EUERE(NS)-$', 'EUERE', None, |
||
483 | 'EUERE(AIOUY)--', 'EUER', None, |
||
484 | 'EUER(AÄIOÖUÜY)-', 'EUER', None, |
||
485 | 'EUER<', 'EUA', None, |
||
486 | 'EUEU--', '', '', |
||
487 | 'EUILLE$', 'Ö', 'Ö', |
||
488 | 'EUR$', 'ÖR', 'ÖR', |
||
489 | 'EUX', 'Ö', 'Ö', |
||
490 | 'EUSZ$', 'EUS', None, |
||
491 | 'EUTZ$', 'EUS', None, |
||
492 | 'EUYS$', 'EUS', 'EUZ', |
||
493 | 'EUZ$', 'EUS', None, |
||
494 | 'EU', 'EU', 'EU', |
||
495 | 'EVER--<1', 'EW', None, |
||
496 | 'EV(ÄOÖUÜ)-1', 'EW', None, |
||
497 | 'EYER<', 'EIA', 'EIA', |
||
498 | 'EY<', 'EI', 'EI', |
||
499 | 'FACETTE', 'FASET', 'FAZET', |
||
500 | 'FANS--^$', 'FE', 'FE', |
||
501 | 'FAN-^$', 'FE', 'FE', |
||
502 | 'FAULT-', 'FOL', 'FUL', |
||
503 | 'FEE(DL)-', 'FI', 'FI', |
||
504 | 'FEHLER', 'FELA', 'FELA', |
||
505 | 'FE(LMNRST)-3^', 'FE', 'FE', |
||
506 | 'FOERDERN---^', 'FÖRD', 'FÖRT', |
||
507 | 'FOERDERN---', ' FÖRD', ' FÖRT', |
||
508 | 'FOND7', 'FON', 'FUN', |
||
509 | 'FRAIN$', 'FRA', 'FRA', |
||
510 | 'FRISEU(RS)-', 'FRISÖ', 'FRIZÖ', |
||
511 | 'FY9^', 'FÜ', None, |
||
512 | 'FÖRDERN---^', 'FÖRD', 'FÖRT', |
||
513 | 'FÖRDERN---', ' FÖRD', ' FÖRT', |
||
514 | 'GAGS^$', 'GEX', 'KEX', |
||
515 | 'GAG^$', 'GEK', 'KEK', |
||
516 | 'GD', 'KT', 'KT', |
||
517 | 'GEGEN^^', 'GEGN', 'KEKN', |
||
518 | 'GEGENGEKOM-----', 'GEGN ', 'KEKN ', |
||
519 | 'GEGENGESET-----', 'GEGN ', 'KEKN ', |
||
520 | 'GEGENKOMME-----', 'GEGN ', 'KEKN ', |
||
521 | 'GEGENZUKOM---', 'GEGN ZU ', 'KEKN ZU ', |
||
522 | 'GENDETWAS-----$', 'GENT ', 'KENT ', |
||
523 | 'GENRE', 'IORE', 'IURE', |
||
524 | 'GE(LMNRST)-3^', 'GE', 'KE', |
||
525 | 'GER(DKT)-', 'GER', None, |
||
526 | 'GETTE$', 'GET', 'KET', |
||
527 | 'GGF.', 'GF.', None, |
||
528 | 'GG-', '', '', |
||
529 | 'GH', 'G', None, |
||
530 | 'GI(AOU)-^', 'I', 'I', |
||
531 | 'GION-3', 'KIO', 'KIU', |
||
532 | 'G(CK)-', '', '', |
||
533 | 'GJ(AEIOU)-^', 'I', 'I', |
||
534 | 'GMBH^$', 'GMBH', 'GMBH', |
||
535 | 'GNAC$', 'NIAK', 'NIAK', |
||
536 | 'GNON$', 'NION', 'NIUN', |
||
537 | 'GN$', 'N', 'N', |
||
538 | 'GONCAL-^', 'GONZA', 'KUNZA', |
||
539 | 'GRY9^', 'GRÜ', None, |
||
540 | 'G(SßXZ)-<', 'K', 'K', |
||
541 | 'GUCK-', 'KU', 'KU', |
||
542 | 'GUISEP-^', 'IUSE', 'IUZE', |
||
543 | 'GUI-^', 'G', 'K', |
||
544 | 'GUTAUSSEH------^', 'GUT ', 'KUT ', |
||
545 | 'GUTGEHEND------^', 'GUT ', 'KUT ', |
||
546 | 'GY9^', 'GÜ', None, |
||
547 | 'G(AÄEILOÖRUÜY)-', 'G', None, |
||
548 | 'G(ÀÁÂÃÅÈÉÊÌÍÎÙÚÛ)-', 'G', None, |
||
549 | 'G\'S$', 'X', 'X', |
||
550 | 'G´S$', 'X', 'X', |
||
551 | 'G^', 'G', None, |
||
552 | 'G', 'K', 'K', |
||
553 | 'HA(HIUY)--1', 'H', None, |
||
554 | 'HANDVOL---^', 'HANT ', 'ANT ', |
||
555 | 'HANNOVE-^', 'HANOF', None, |
||
556 | 'HAVEN7$', 'HAFN', None, |
||
557 | 'HEAD-', 'HE', 'E', |
||
558 | 'HELIEGEN------', 'E ', 'E ', |
||
559 | 'HESTEHEN------', 'E ', 'E ', |
||
560 | 'HE(LMNRST)-3^', 'HE', 'E', |
||
561 | 'HE(LMN)-1', 'E', 'E', |
||
562 | 'HEUR1$', 'ÖR', 'ÖR', |
||
563 | 'HE(HIUY)--1', 'H', None, |
||
564 | 'HIH(AÄEIOÖUÜY)-1', 'IH', None, |
||
565 | 'HLH(AÄEIOÖUÜY)-1', 'LH', None, |
||
566 | 'HMH(AÄEIOÖUÜY)-1', 'MH', None, |
||
567 | 'HNH(AÄEIOÖUÜY)-1', 'NH', None, |
||
568 | 'HOBBY9^', 'HOBI', None, |
||
569 | 'HOCHBEGAB-----^', 'HOCH ', 'UK ', |
||
570 | 'HOCHTALEN-----^', 'HOCH ', 'UK ', |
||
571 | 'HOCHZUFRI-----^', 'HOCH ', 'UK ', |
||
572 | 'HO(HIY)--1', 'H', None, |
||
573 | 'HRH(AÄEIOÖUÜY)-1', 'RH', None, |
||
574 | 'HUH(AÄEIOÖUÜY)-1', 'UH', None, |
||
575 | 'HUIS^^', 'HÜS', 'IZ', |
||
576 | 'HUIS$', 'ÜS', 'IZ', |
||
577 | 'HUI--1', 'H', None, |
||
578 | 'HYGIEN^', 'HÜKIEN', None, |
||
579 | 'HY9^', 'HÜ', None, |
||
580 | 'HY(BDGMNPST)-', 'Ü', None, |
||
581 | 'H.^', None, 'H.', |
||
582 | 'HÄU--1', 'H', None, |
||
583 | 'H^', 'H', '', |
||
584 | 'H', '', '', |
||
585 | 'ICHELL---', 'ISH', 'IZ', |
||
586 | 'ICHI$', 'ISHI', 'IZI', |
||
587 | 'IEC$', 'IZ', 'IZ', |
||
588 | 'IEDENSTELLE------', 'IDN ', 'ITN ', |
||
589 | 'IEI-3', '', '', |
||
590 | 'IELL3', 'IEL', 'IEL', |
||
591 | 'IENNE$', 'IN', 'IN', |
||
592 | 'IERRE$', 'IER', 'IER', |
||
593 | 'IERZULAN---', 'IR ZU ', 'IR ZU ', |
||
594 | 'IETTE$', 'IT', 'IT', |
||
595 | 'IEU', 'IÖ', 'IÖ', |
||
596 | 'IE<4', 'I', 'I', |
||
597 | 'IGL-1', 'IK', None, |
||
598 | 'IGHT3$', 'EIT', 'EIT', |
||
599 | 'IGNI(EO)-', 'INI', 'INI', |
||
600 | 'IGN(AEOU)-$', 'INI', 'INI', |
||
601 | 'IHER(DGLKRT)--1', 'IHE', None, |
||
602 | 'IHE(IUY)--', 'IH', None, |
||
603 | 'IH(AIOÖUÜY)-', 'IH', None, |
||
604 | 'IJ(AOU)-', 'I', 'I', |
||
605 | 'IJ$', 'I', 'I', |
||
606 | 'IJ<', 'EI', 'EI', |
||
607 | 'IKOLE$', 'IKOL', 'IKUL', |
||
608 | 'ILLAN(STZ)--4', 'ILIA', 'ILIA', |
||
609 | 'ILLAR(DT)--4', 'ILIA', 'ILIA', |
||
610 | 'IMSTAN----^', 'IM ', 'IN ', |
||
611 | 'INDELERREGE------', 'INDL ', 'INTL ', |
||
612 | 'INFRAGE-----^$', 'IN ', 'IN ', |
||
613 | 'INTERN(AOU)-^', 'INTAN', 'INTAN', |
||
614 | 'INVER-', 'INWE', 'INFE', |
||
615 | 'ITI(AÄIOÖUÜ)-', 'IZI', 'IZI', |
||
616 | 'IUSZ$', 'IUS', None, |
||
617 | 'IUTZ$', 'IUS', None, |
||
618 | 'IUZ$', 'IUS', None, |
||
619 | 'IVER--<', 'IW', None, |
||
620 | 'IVIER$', 'IWIE', 'IFIE', |
||
621 | 'IV(ÄOÖUÜ)-', 'IW', None, |
||
622 | 'IV<3', 'IW', None, |
||
623 | 'IY2', 'I', None, |
||
624 | 'I(ÈÉÊ)<4', 'I', 'I', |
||
625 | 'JAVIE---<^', 'ZA', 'ZA', |
||
626 | 'JEANS^$', 'JINS', 'INZ', |
||
627 | 'JEANNE^$', 'IAN', 'IAN', |
||
628 | 'JEAN-^', 'IA', 'IA', |
||
629 | 'JER-^', 'IE', 'IE', |
||
630 | 'JE(LMNST)-', 'IE', 'IE', |
||
631 | 'JI^', 'JI', None, |
||
632 | 'JOR(GK)^$', 'IÖRK', 'IÖRK', |
||
633 | 'J', 'I', 'I', |
||
634 | 'KC(ÄEIJ)-', 'X', 'X', |
||
635 | 'KD', 'KT', None, |
||
636 | 'KE(LMNRST)-3^', 'KE', 'KE', |
||
637 | 'KG(AÄEILOÖRUÜY)-', 'K', None, |
||
638 | 'KH<^', 'K', 'K', |
||
639 | 'KIC$', 'KIZ', 'KIZ', |
||
640 | 'KLE(LMNRST)-3^', 'KLE', 'KLE', |
||
641 | 'KOTELE-^', 'KOTL', 'KUTL', |
||
642 | 'KREAT-^', 'KREA', 'KREA', |
||
643 | 'KRÜS(TZ)--^', 'KRI', None, |
||
644 | 'KRYS(TZ)--^', 'KRI', None, |
||
645 | 'KRY9^', 'KRÜ', None, |
||
646 | 'KSCH---', 'K', 'K', |
||
647 | 'KSH--', 'K', 'K', |
||
648 | 'K(SßXZ)7', 'X', 'X', # implies 'KST' -> 'XT' |
||
649 | 'KT\'S$', 'X', 'X', |
||
650 | 'KTI(AIOU)-3', 'XI', 'XI', |
||
651 | 'KT(SßXZ)', 'X', 'X', |
||
652 | 'KY9^', 'KÜ', None, |
||
653 | 'K\'S$', 'X', 'X', |
||
654 | 'K´S$', 'X', 'X', |
||
655 | 'LANGES$', ' LANGES', ' LANKEZ', |
||
656 | 'LANGE$', ' LANGE', ' LANKE', |
||
657 | 'LANG$', ' LANK', ' LANK', |
||
658 | 'LARVE-', 'LARF', 'LARF', |
||
659 | 'LD(SßZ)$', 'LS', 'LZ', |
||
660 | 'LD\'S$', 'LS', 'LZ', |
||
661 | 'LD´S$', 'LS', 'LZ', |
||
662 | 'LEAND-^', 'LEAN', 'LEAN', |
||
663 | 'LEERSTEHE-----^', 'LER ', 'LER ', |
||
664 | 'LEICHBLEIB-----', 'LEICH ', 'LEIK ', |
||
665 | 'LEICHLAUTE-----', 'LEICH ', 'LEIK ', |
||
666 | 'LEIDERREGE------', 'LEIT ', 'LEIT ', |
||
667 | 'LEIDGEPR----^', 'LEIT ', 'LEIT ', |
||
668 | 'LEINSTEHE-----', 'LEIN ', 'LEIN ', |
||
669 | 'LEL-', 'LE', 'LE', |
||
670 | 'LE(MNRST)-3^', 'LE', 'LE', |
||
671 | 'LETTE$', 'LET', 'LET', |
||
672 | 'LFGNAG-', 'LFGAN', 'LFKAN', |
||
673 | 'LICHERWEIS----', 'LICHA ', 'LIKA ', |
||
674 | 'LIC$', 'LIZ', 'LIZ', |
||
675 | 'LIVE^$', 'LEIF', 'LEIF', |
||
676 | 'LT(SßZ)$', 'LS', 'LZ', |
||
677 | 'LT\'S$', 'LS', 'LZ', |
||
678 | 'LT´S$', 'LS', 'LZ', |
||
679 | 'LUI(GS)--', 'LU', 'LU', |
||
680 | 'LV(AIO)-', 'LW', None, |
||
681 | 'LY9^', 'LÜ', None, |
||
682 | 'LSTS$', 'LS', 'LZ', |
||
683 | 'LZ(BDFGKLMNPQRSTVWX)-', 'LS', None, |
||
684 | 'L(SßZ)$', 'LS', None, |
||
685 | 'MAIR-<', 'MEI', 'NEI', |
||
686 | 'MANAG-', 'MENE', 'NENE', |
||
687 | 'MANUEL', 'MANUEL', None, |
||
688 | 'MASSEU(RS)-', 'MASÖ', 'NAZÖ', |
||
689 | 'MATCH', 'MESH', 'NEZ', |
||
690 | 'MAURICE', 'MORIS', 'NURIZ', |
||
691 | 'MBH^$', 'MBH', 'MBH', |
||
692 | 'MB(ßZ)$', 'MS', None, |
||
693 | 'MB(SßTZ)-', 'M', 'N', |
||
694 | 'MCG9^', 'MAK', 'NAK', |
||
695 | 'MC9^', 'MAK', 'NAK', |
||
696 | 'MEMOIR-^', 'MEMOA', 'NENUA', |
||
697 | 'MERHAVEN$', 'MAHAFN', None, |
||
698 | 'ME(LMNRST)-3^', 'ME', 'NE', |
||
699 | 'MEN(STZ)--3', 'ME', None, |
||
700 | 'MEN$', 'MEN', None, |
||
701 | 'MIGUEL-', 'MIGE', 'NIKE', |
||
702 | 'MIKE^$', 'MEIK', 'NEIK', |
||
703 | 'MITHILFE----^$', 'MIT H', 'NIT ', |
||
704 | 'MN$', 'M', None, |
||
705 | 'MN', 'N', 'N', |
||
706 | 'MPJUTE-', 'MPUT', 'NBUT', |
||
707 | 'MP(ßZ)$', 'MS', None, |
||
708 | 'MP(SßTZ)-', 'M', 'N', |
||
709 | 'MP(BDJLMNPQVW)-', 'MB', 'NB', |
||
710 | 'MY9^', 'MÜ', None, |
||
711 | 'M(ßZ)$', 'MS', None, |
||
712 | 'M´G7^', 'MAK', 'NAK', |
||
713 | 'M\'G7^', 'MAK', 'NAK', |
||
714 | 'M´^', 'MAK', 'NAK', |
||
715 | 'M\'^', 'MAK', 'NAK', |
||
716 | 'M', None, 'N', |
||
717 | 'NACH^^', 'NACH', 'NAK', |
||
718 | 'NADINE', 'NADIN', 'NATIN', |
||
719 | 'NAIV--', 'NA', 'NA', |
||
720 | 'NAISE$', 'NESE', 'NEZE', |
||
721 | 'NAUGENOMM------', 'NAU ', 'NAU ', |
||
722 | 'NAUSOGUT$', 'NAUSO GUT', 'NAUZU KUT', |
||
723 | 'NCH$', 'NSH', 'NZ', |
||
724 | 'NCOISE$', 'SOA', 'ZUA', |
||
725 | 'NCOIS$', 'SOA', 'ZUA', |
||
726 | 'NDAR$', 'NDA', 'NTA', |
||
727 | 'NDERINGEN------', 'NDE ', 'NTE ', |
||
728 | 'NDRO(CDKTZ)-', 'NTRO', None, |
||
729 | 'ND(BFGJLMNPQVW)-', 'NT', None, |
||
730 | 'ND(SßZ)$', 'NS', 'NZ', |
||
731 | 'ND\'S$', 'NS', 'NZ', |
||
732 | 'ND´S$', 'NS', 'NZ', |
||
733 | 'NEBEN^^', 'NEBN', 'NEBN', |
||
734 | 'NENGELERN------', 'NEN ', 'NEN ', |
||
735 | 'NENLERN(ET)---', 'NEN LE', 'NEN LE', |
||
736 | 'NENZULERNE---', 'NEN ZU LE', 'NEN ZU LE', |
||
737 | 'NE(LMNRST)-3^', 'NE', 'NE', |
||
738 | 'NEN-3', 'NE', 'NE', |
||
739 | 'NETTE$', 'NET', 'NET', |
||
740 | 'NGU^^', 'NU', 'NU', |
||
741 | 'NG(BDFJLMNPQRTVW)-', 'NK', 'NK', |
||
742 | 'NH(AUO)-$', 'NI', 'NI', |
||
743 | 'NICHTSAHNEN-----', 'NIX ', 'NIX ', |
||
744 | 'NICHTSSAGE----', 'NIX ', 'NIX ', |
||
745 | 'NICHTS^^', 'NIX', 'NIX', |
||
746 | 'NICHT^^', 'NICHT', 'NIKT', |
||
747 | 'NINE$', 'NIN', 'NIN', |
||
748 | 'NON^^', 'NON', 'NUN', |
||
749 | 'NOTLEIDE-----^', 'NOT ', 'NUT ', |
||
750 | 'NOT^^', 'NOT', 'NUT', |
||
751 | 'NTI(AIOU)-3', 'NZI', 'NZI', |
||
752 | 'NTIEL--3', 'NZI', 'NZI', |
||
753 | 'NT(SßZ)$', 'NS', 'NZ', |
||
754 | 'NT\'S$', 'NS', 'NZ', |
||
755 | 'NT´S$', 'NS', 'NZ', |
||
756 | 'NYLON', 'NEILON', 'NEILUN', |
||
757 | 'NY9^', 'NÜ', None, |
||
758 | 'NSTZUNEH---', 'NST ZU ', 'NZT ZU ', |
||
759 | 'NSZ-', 'NS', None, |
||
760 | 'NSTS$', 'NS', 'NZ', |
||
761 | 'NZ(BDFGKLMNPQRSTVWX)-', 'NS', None, |
||
762 | 'N(SßZ)$', 'NS', None, |
||
763 | 'OBERE-', 'OBER', None, |
||
764 | 'OBER^^', 'OBA', 'UBA', |
||
765 | 'OEU2', 'Ö', 'Ö', |
||
766 | 'OE<2', 'Ö', 'Ö', |
||
767 | 'OGL-', 'OK', None, |
||
768 | 'OGNIE-', 'ONI', 'UNI', |
||
769 | 'OGN(AEOU)-$', 'ONI', 'UNI', |
||
770 | 'OH(AIOÖUÜY)-', 'OH', None, |
||
771 | 'OIE$', 'Ö', 'Ö', |
||
772 | 'OIRE$', 'OA', 'UA', |
||
773 | 'OIR$', 'OA', 'UA', |
||
774 | 'OIX', 'OA', 'UA', |
||
775 | 'OI<3', 'EU', 'EU', |
||
776 | 'OKAY^$', 'OKE', 'UKE', |
||
777 | 'OLYN$', 'OLIN', 'ULIN', |
||
778 | 'OO(DLMZ)-', 'U', None, |
||
779 | 'OO$', 'U', None, |
||
780 | 'OO-', '', '', |
||
781 | 'ORGINAL-----', 'ORI', 'URI', |
||
782 | 'OTI(AÄOÖUÜ)-', 'OZI', 'UZI', |
||
783 | 'OUI^', 'WI', 'FI', |
||
784 | 'OUILLE$', 'ULIE', 'ULIE', |
||
785 | 'OU(DT)-^', 'AU', 'AU', |
||
786 | 'OUSE$', 'AUS', 'AUZ', |
||
787 | 'OUT-', 'AU', 'AU', |
||
788 | 'OU', 'U', 'U', |
||
789 | 'O(FV)$', 'AU', 'AU', # due to 'OW$' -> 'AU' |
||
790 | 'OVER--<', 'OW', None, |
||
791 | 'OV(AOU)-', 'OW', None, |
||
792 | 'OW$', 'AU', 'AU', |
||
793 | 'OWS$', 'OS', 'UZ', |
||
794 | 'OJ(AÄEIOÖUÜ)--', 'O', 'U', |
||
795 | 'OYER', 'OIA', None, |
||
796 | 'OY(AÄEIOÖUÜ)--', 'O', 'U', |
||
797 | 'O(JY)<', 'EU', 'EU', |
||
798 | 'OZ$', 'OS', None, |
||
799 | 'O´^', 'O', 'U', |
||
800 | 'O\'^', 'O', 'U', |
||
801 | 'O', None, 'U', |
||
802 | 'PATIEN--^', 'PAZI', 'PAZI', |
||
803 | 'PENSIO-^', 'PANSI', 'PANZI', |
||
804 | 'PE(LMNRST)-3^', 'PE', 'PE', |
||
805 | 'PFER-^', 'FE', 'FE', |
||
806 | 'P(FH)<', 'F', 'F', |
||
807 | 'PIC^$', 'PIK', 'PIK', |
||
808 | 'PIC$', 'PIZ', 'PIZ', |
||
809 | 'PIPELINE', 'PEIBLEIN', 'PEIBLEIN', |
||
810 | 'POLYP-', 'POLÜ', None, |
||
811 | 'POLY^^', 'POLI', 'PULI', |
||
812 | 'PORTRAIT7', 'PORTRE', 'PURTRE', |
||
813 | 'POWER7', 'PAUA', 'PAUA', |
||
814 | 'PP(FH)--<', 'B', 'B', |
||
815 | 'PP-', '', '', |
||
816 | 'PRODUZ-^', 'PRODU', 'BRUTU', |
||
817 | 'PRODUZI--', ' PRODU', ' BRUTU', |
||
818 | 'PRIX^$', 'PRI', 'PRI', |
||
819 | 'PS-^^', 'P', None, |
||
820 | 'P(SßZ)^', None, 'Z', |
||
821 | 'P(SßZ)$', 'BS', None, |
||
822 | 'PT-^', '', '', |
||
823 | 'PTI(AÄOÖUÜ)-3', 'BZI', 'BZI', |
||
824 | 'PY9^', 'PÜ', None, |
||
825 | 'P(AÄEIOÖRUÜY)-', 'P', 'P', |
||
826 | 'P(ÀÁÂÃÅÈÉÊÌÍÎÙÚÛ)-', 'P', None, |
||
827 | 'P.^', None, 'P.', |
||
828 | 'P^', 'P', None, |
||
829 | 'P', 'B', 'B', |
||
830 | 'QI-', 'Z', 'Z', |
||
831 | 'QUARANT--', 'KARA', 'KARA', |
||
832 | 'QUE(LMNRST)-3', 'KWE', 'KFE', |
||
833 | 'QUE$', 'K', 'K', |
||
834 | 'QUI(NS)$', 'KI', 'KI', |
||
835 | 'QUIZ7', 'KWIS', None, |
||
836 | 'Q(UV)7', 'KW', 'KF', |
||
837 | 'Q<', 'K', 'K', |
||
838 | 'RADFAHR----', 'RAT ', 'RAT ', |
||
839 | 'RAEFTEZEHRE-----', 'REFTE ', 'REFTE ', |
||
840 | 'RCH', 'RCH', 'RK', |
||
841 | 'REA(DU)---3^', 'R', None, |
||
842 | 'REBSERZEUG------', 'REBS ', 'REBZ ', |
||
843 | 'RECHERCH^', 'RESHASH', 'REZAZ', |
||
844 | 'RECYCL--', 'RIZEI', 'RIZEI', |
||
845 | 'RE(ALST)-3^', 'RE', None, |
||
846 | 'REE$', 'RI', 'RI', |
||
847 | 'RER$', 'RA', 'RA', |
||
848 | 'RE(MNR)-4', 'RE', 'RE', |
||
849 | 'RETTE$', 'RET', 'RET', |
||
850 | 'REUZ$', 'REUZ', None, |
||
851 | 'REW$', 'RU', 'RU', |
||
852 | 'RH<^', 'R', 'R', |
||
853 | 'RJA(MN)--', 'RI', 'RI', |
||
854 | 'ROWD-^', 'RAU', 'RAU', |
||
855 | 'RTEMONNAIE-', 'RTMON', 'RTNUN', |
||
856 | 'RTI(AÄOÖUÜ)-3', 'RZI', 'RZI', |
||
857 | 'RTIEL--3', 'RZI', 'RZI', |
||
858 | 'RV(AEOU)-3', 'RW', None, |
||
859 | 'RY(KN)-$', 'RI', 'RI', |
||
860 | 'RY9^', 'RÜ', None, |
||
861 | 'RÄFTEZEHRE-----', 'REFTE ', 'REFTE ', |
||
862 | 'SAISO-^', 'SES', 'ZEZ', |
||
863 | 'SAFE^$', 'SEIF', 'ZEIF', |
||
864 | 'SAUCE-^', 'SOS', 'ZUZ', |
||
865 | 'SCHLAGGEBEN-----<', 'SHLAK ', 'ZLAK ', |
||
866 | 'SCHSCH---7', '', '', |
||
867 | 'SCHTSCH', 'SH', 'Z', |
||
868 | 'SC(HZ)<', 'SH', 'Z', |
||
869 | 'SC', 'SK', 'ZK', |
||
870 | 'SELBSTST--7^^', 'SELB', 'ZELB', |
||
871 | 'SELBST7^^', 'SELBST', 'ZELBZT', |
||
872 | 'SERVICE7^', 'SÖRWIS', 'ZÖRFIZ', |
||
873 | 'SERVI-^', 'SERW', None, |
||
874 | 'SE(LMNRST)-3^', 'SE', 'ZE', |
||
875 | 'SETTE$', 'SET', 'ZET', |
||
876 | 'SHP-^', 'S', 'Z', |
||
877 | 'SHST', 'SHT', 'ZT', |
||
878 | 'SHTSH', 'SH', 'Z', |
||
879 | 'SHT', 'ST', 'Z', |
||
880 | 'SHY9^', 'SHÜ', None, |
||
881 | 'SH^^', 'SH', None, |
||
882 | 'SH3', 'SH', 'Z', |
||
883 | 'SICHERGEGAN-----^', 'SICHA ', 'ZIKA ', |
||
884 | 'SICHERGEHE----^', 'SICHA ', 'ZIKA ', |
||
885 | 'SICHERGESTEL------^', 'SICHA ', 'ZIKA ', |
||
886 | 'SICHERSTELL-----^', 'SICHA ', 'ZIKA ', |
||
887 | 'SICHERZU(GS)--^', 'SICHA ZU ', 'ZIKA ZU ', |
||
888 | 'SIEGLI-^', 'SIKL', 'ZIKL', |
||
889 | 'SIGLI-^', 'SIKL', 'ZIKL', |
||
890 | 'SIGHT', 'SEIT', 'ZEIT', |
||
891 | 'SIGN', 'SEIN', 'ZEIN', |
||
892 | 'SKI(NPZ)-', 'SKI', 'ZKI', |
||
893 | 'SKI<^', 'SHI', 'ZI', |
||
894 | 'SODASS^$', 'SO DAS', 'ZU TAZ', |
||
895 | 'SODAß^$', 'SO DAS', 'ZU TAZ', |
||
896 | 'SOGENAN--^', 'SO GEN', 'ZU KEN', |
||
897 | 'SOUND-', 'SAUN', 'ZAUN', |
||
898 | 'STAATS^^', 'STAZ', 'ZTAZ', |
||
899 | 'STADT^^', 'STAT', 'ZTAT', |
||
900 | 'STANDE$', ' STANDE', ' ZTANTE', |
||
901 | 'START^^', 'START', 'ZTART', |
||
902 | 'STAURANT7', 'STORAN', 'ZTURAN', |
||
903 | 'STEAK-', 'STE', 'ZTE', |
||
904 | 'STEPHEN-^$', 'STEW', None, |
||
905 | 'STERN', 'STERN', None, |
||
906 | 'STRAF^^', 'STRAF', 'ZTRAF', |
||
907 | 'ST\'S$', 'Z', 'Z', |
||
908 | 'ST´S$', 'Z', 'Z', |
||
909 | 'STST--', '', '', |
||
910 | 'STS(ACEÈÉÊHIÌÍÎOUÄÜÖ)--', 'ST', 'ZT', |
||
911 | 'ST(SZ)', 'Z', 'Z', |
||
912 | 'SPAREN---^', 'SPA', 'ZPA', |
||
913 | 'SPAREND----', ' SPA', ' ZPA', |
||
914 | 'S(PTW)-^^', 'S', None, |
||
915 | 'SP', 'SP', None, |
||
916 | 'STYN(AE)-$', 'STIN', 'ZTIN', |
||
917 | 'ST', 'ST', 'ZT', |
||
918 | 'SUITE<', 'SIUT', 'ZIUT', |
||
919 | 'SUKE--$', 'S', 'Z', |
||
920 | 'SURF(EI)-', 'SÖRF', 'ZÖRF', |
||
921 | 'SV(AEÈÉÊIÌÍÎOU)-<^', 'SW', None, |
||
922 | 'SYB(IY)--^', 'SIB', None, |
||
923 | 'SYL(KVW)--^', 'SI', None, |
||
924 | 'SY9^', 'SÜ', None, |
||
925 | 'SZE(NPT)-^', 'ZE', 'ZE', |
||
926 | 'SZI(ELN)-^', 'ZI', 'ZI', |
||
927 | 'SZCZ<', 'SH', 'Z', |
||
928 | 'SZT<', 'ST', 'ZT', |
||
929 | 'SZ<3', 'SH', 'Z', |
||
930 | 'SÜL(KVW)--^', 'SI', None, |
||
931 | 'S', None, 'Z', |
||
932 | 'TCH', 'SH', 'Z', |
||
933 | 'TD(AÄEIOÖRUÜY)-', 'T', None, |
||
934 | 'TD(ÀÁÂÃÅÈÉÊËÌÍÎÏÒÓÔÕØÙÚÛÝŸ)-', 'T', None, |
||
935 | 'TEAT-^', 'TEA', 'TEA', |
||
936 | 'TERRAI7^', 'TERA', 'TERA', |
||
937 | 'TE(LMNRST)-3^', 'TE', 'TE', |
||
938 | 'TH<', 'T', 'T', |
||
939 | 'TICHT-', 'TIK', 'TIK', |
||
940 | 'TICH$', 'TIK', 'TIK', |
||
941 | 'TIC$', 'TIZ', 'TIZ', |
||
942 | 'TIGGESTELL-------', 'TIK ', 'TIK ', |
||
943 | 'TIGSTELL-----', 'TIK ', 'TIK ', |
||
944 | 'TOAS-^', 'TO', 'TU', |
||
945 | 'TOILET-', 'TOLE', 'TULE', |
||
946 | 'TOIN-', 'TOA', 'TUA', |
||
947 | 'TRAECHTI-^', 'TRECHT', 'TREKT', |
||
948 | 'TRAECHTIG--', ' TRECHT', ' TREKT', |
||
949 | 'TRAINI-', 'TREN', 'TREN', |
||
950 | 'TRÄCHTI-^', 'TRECHT', 'TREKT', |
||
951 | 'TRÄCHTIG--', ' TRECHT', ' TREKT', |
||
952 | 'TSCH', 'SH', 'Z', |
||
953 | 'TSH', 'SH', 'Z', |
||
954 | 'TST', 'ZT', 'ZT', |
||
955 | 'T(Sß)', 'Z', 'Z', |
||
956 | 'TT(SZ)--<', '', '', |
||
957 | 'TT9', 'T', 'T', |
||
958 | 'TV^$', 'TV', 'TV', |
||
959 | 'TX(AEIOU)-3', 'SH', 'Z', |
||
960 | 'TY9^', 'TÜ', None, |
||
961 | 'TZ-', '', '', |
||
962 | 'T\'S3$', 'Z', 'Z', |
||
963 | 'T´S3$', 'Z', 'Z', |
||
964 | 'UEBEL(GNRW)-^^', 'ÜBL ', 'IBL ', |
||
965 | 'UEBER^^', 'ÜBA', 'IBA', |
||
966 | 'UE2', 'Ü', 'I', |
||
967 | 'UGL-', 'UK', None, |
||
968 | 'UH(AOÖUÜY)-', 'UH', None, |
||
969 | 'UIE$', 'Ü', 'I', |
||
970 | 'UM^^', 'UM', 'UN', |
||
971 | 'UNTERE--3', 'UNTE', 'UNTE', |
||
972 | 'UNTER^^', 'UNTA', 'UNTA', |
||
973 | 'UNVER^^', 'UNFA', 'UNFA', |
||
974 | 'UN^^', 'UN', 'UN', |
||
975 | 'UTI(AÄOÖUÜ)-', 'UZI', 'UZI', |
||
976 | 'UVE-4', 'UW', None, |
||
977 | 'UY2', 'UI', None, |
||
978 | 'UZZ', 'AS', 'AZ', |
||
979 | 'VACL-^', 'WAZ', 'FAZ', |
||
980 | 'VAC$', 'WAZ', 'FAZ', |
||
981 | 'VAN DEN ^', 'FANDN', 'FANTN', |
||
982 | 'VANES-^', 'WANE', None, |
||
983 | 'VATRO-', 'WATR', None, |
||
984 | 'VA(DHJNT)--^', 'F', None, |
||
985 | 'VEDD-^', 'FE', 'FE', |
||
986 | 'VE(BEHIU)--^', 'F', None, |
||
987 | 'VEL(BDLMNT)-^', 'FEL', None, |
||
988 | 'VENTZ-^', 'FEN', None, |
||
989 | 'VEN(NRSZ)-^', 'FEN', None, |
||
990 | 'VER(AB)-^$', 'WER', None, |
||
991 | 'VERBAL^$', 'WERBAL', None, |
||
992 | 'VERBAL(EINS)-^', 'WERBAL', None, |
||
993 | 'VERTEBR--', 'WERTE', None, |
||
994 | 'VEREIN-----', 'F', None, |
||
995 | 'VEREN(AEIOU)-^', 'WEREN', None, |
||
996 | 'VERIFI', 'WERIFI', None, |
||
997 | 'VERON(AEIOU)-^', 'WERON', None, |
||
998 | 'VERSEN^', 'FERSN', 'FAZN', |
||
999 | 'VERSIERT--^', 'WERSI', None, |
||
1000 | 'VERSIO--^', 'WERS', None, |
||
1001 | 'VERSUS', 'WERSUS', None, |
||
1002 | 'VERTI(GK)-', 'WERTI', None, |
||
1003 | 'VER^^', 'FER', 'FA', |
||
1004 | 'VERSPRECHE-------', ' FER', ' FA', |
||
1005 | 'VER$', 'WA', None, |
||
1006 | 'VER', 'FA', 'FA', |
||
1007 | 'VET(HT)-^', 'FET', 'FET', |
||
1008 | 'VETTE$', 'WET', 'FET', |
||
1009 | 'VE^', 'WE', None, |
||
1010 | 'VIC$', 'WIZ', 'FIZ', |
||
1011 | 'VIELSAGE----', 'FIL ', 'FIL ', |
||
1012 | 'VIEL', 'FIL', 'FIL', |
||
1013 | 'VIEW', 'WIU', 'FIU', |
||
1014 | 'VILL(AE)-', 'WIL', None, |
||
1015 | 'VIS(ACEIKUVWZ)-<^', 'WIS', None, |
||
1016 | 'VI(ELS)--^', 'F', None, |
||
1017 | 'VILLON--', 'WILI', 'FILI', |
||
1018 | 'VIZE^^', 'FIZE', 'FIZE', |
||
1019 | 'VLIE--^', 'FL', None, |
||
1020 | 'VL(AEIOU)--', 'W', None, |
||
1021 | 'VOKA-^', 'WOK', None, |
||
1022 | 'VOL(ATUVW)--^', 'WO', None, |
||
1023 | 'VOR^^', 'FOR', 'FUR', |
||
1024 | 'VR(AEIOU)--', 'W', None, |
||
1025 | 'VV9', 'W', None, |
||
1026 | 'VY9^', 'WÜ', 'FI', |
||
1027 | 'V(ÜY)-', 'W', None, |
||
1028 | 'V(ÀÁÂÃÅÈÉÊÌÍÎÙÚÛ)-', 'W', None, |
||
1029 | 'V(AEIJLRU)-<', 'W', None, |
||
1030 | 'V.^', 'V.', None, |
||
1031 | 'V<', 'F', 'F', |
||
1032 | 'WEITERENTWI-----^', 'WEITA ', 'FEITA ', |
||
1033 | 'WEITREICH-----^', 'WEIT ', 'FEIT ', |
||
1034 | 'WEITVER^', 'WEIT FER', 'FEIT FA', |
||
1035 | 'WE(LMNRST)-3^', 'WE', 'FE', |
||
1036 | 'WER(DST)-', 'WER', None, |
||
1037 | 'WIC$', 'WIZ', 'FIZ', |
||
1038 | 'WIEDERU--', 'WIDE', 'FITE', |
||
1039 | 'WIEDER^$', 'WIDA', 'FITA', |
||
1040 | 'WIEDER^^', 'WIDA ', 'FITA ', |
||
1041 | 'WIEVIEL', 'WI FIL', 'FI FIL', |
||
1042 | 'WISUEL', 'WISUEL', None, |
||
1043 | 'WR-^', 'W', None, |
||
1044 | 'WY9^', 'WÜ', 'FI', |
||
1045 | 'W(BDFGJKLMNPQRSTZ)-', 'F', None, |
||
1046 | 'W$', 'F', None, |
||
1047 | 'W', None, 'F', |
||
1048 | 'X<^', 'Z', 'Z', |
||
1049 | 'XHAVEN$', 'XAFN', None, |
||
1050 | 'X(CSZ)', 'X', 'X', |
||
1051 | 'XTS(CH)--', 'XT', 'XT', |
||
1052 | 'XT(SZ)', 'Z', 'Z', |
||
1053 | 'YE(LMNRST)-3^', 'IE', 'IE', |
||
1054 | 'YE-3', 'I', 'I', |
||
1055 | 'YOR(GK)^$', 'IÖRK', 'IÖRK', |
||
1056 | 'Y(AOU)-<7', 'I', 'I', |
||
1057 | 'Y(BKLMNPRSTX)-1', 'Ü', None, |
||
1058 | 'YVES^$', 'IF', 'IF', |
||
1059 | 'YVONNE^$', 'IWON', 'IFUN', |
||
1060 | 'Y.^', 'Y.', None, |
||
1061 | 'Y', 'I', 'I', |
||
1062 | 'ZC(AOU)-', 'SK', 'ZK', |
||
1063 | 'ZE(LMNRST)-3^', 'ZE', 'ZE', |
||
1064 | 'ZIEJ$', 'ZI', 'ZI', |
||
1065 | 'ZIGERJA(HR)-3', 'ZIGA IA', 'ZIKA IA', |
||
1066 | 'ZL(AEIOU)-', 'SL', None, |
||
1067 | 'ZS(CHT)--', '', '', |
||
1068 | 'ZS', 'SH', 'Z', |
||
1069 | 'ZUERST', 'ZUERST', 'ZUERST', |
||
1070 | 'ZUGRUNDE^$', 'ZU GRUNDE', 'ZU KRUNTE', |
||
1071 | 'ZUGRUNDE', 'ZU GRUNDE ', 'ZU KRUNTE ', |
||
1072 | 'ZUGUNSTEN', 'ZU GUNSTN', 'ZU KUNZTN', |
||
1073 | 'ZUHAUSE-', 'ZU HAUS', 'ZU AUZ', |
||
1074 | 'ZULASTEN^$', 'ZU LASTN', 'ZU LAZTN', |
||
1075 | 'ZURUECK^^', 'ZURÜK', 'ZURIK', |
||
1076 | 'ZURZEIT', 'ZUR ZEIT', 'ZUR ZEIT', |
||
1077 | 'ZURÜCK^^', 'ZURÜK', 'ZURIK', |
||
1078 | 'ZUSTANDE', 'ZU STANDE', 'ZU ZTANTE', |
||
1079 | 'ZUTAGE', 'ZU TAGE', 'ZU TAKE', |
||
1080 | 'ZUVER^^', 'ZUFA', 'ZUFA', |
||
1081 | 'ZUVIEL', 'ZU FIL', 'ZU FIL', |
||
1082 | 'ZUWENIG', 'ZU WENIK', 'ZU FENIK', |
||
1083 | 'ZY9^', 'ZÜ', None, |
||
1084 | 'ZYK3$', 'ZIK', None, |
||
1085 | 'Z(VW)7^', 'SW', None, |
||
1086 | None, None, None |
||
1087 | # fmt: on |
||
1088 | ) |
||
1089 | |||
1090 | 1 | phonet_hash = Counter() |
|
1091 | 1 | alpha_pos = Counter() |
|
1092 | |||
1093 | 1 | phonet_hash_1 = Counter() |
|
1094 | 1 | phonet_hash_2 = Counter() |
|
1095 | |||
1096 | 1 | _phonet_upper_translation = dict( |
|
1097 | zip( |
||
1098 | ( |
||
1099 | ord(_) |
||
1100 | for _ in 'abcdefghijklmnopqrstuvwxyzàáâãåäæ' |
||
1101 | + 'çðèéêëìíîïñòóôõöøœšßþùúûüýÿ' |
||
1102 | ), |
||
1103 | 'ABCDEFGHIJKLMNOPQRSTUVWXYZÀÁÂÃÅÄÆ' |
||
1104 | + 'ÇÐÈÉÊËÌÍÎÏÑÒÓÔÕÖØŒŠßÞÙÚÛÜÝŸ', |
||
1105 | ) |
||
1106 | ) |
||
1107 | |||
def _initialize_phonet(lang):
    """Fill the phonet lookup tables for the chosen rule set.

    Writes into ``phonet_hash``, ``alpha_pos``, ``phonet_hash_1`` and
    ``phonet_hash_2`` from the enclosing scope.

    :param str lang: 'none' selects ``_phonet_rules_no_lang``; any other
        value selects ``_phonet_rules_german``
    """
    rules = _phonet_rules_no_lang if lang == 'none' else _phonet_rules_german

    phonet_hash[''] = -1

    # German and international umlauts all share alphabet position 1
    for letter in 'ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖØÙÚÛÜÝÞßŒŠŸ':
        alpha_pos[letter] = 1
        phonet_hash[letter] = -1

    # "normal" letters ('A'-'Z') occupy positions 2..27
    for position, letter in enumerate('ABCDEFGHIJKLMNOPQRSTUVWXYZ', 2):
        alpha_pos[letter] = position
        phonet_hash[letter] = -1

    # -1 marks "no rule" for every (first letter, second position) pair
    for row in range(26):
        for col in range(28):
            phonet_hash_1[row, col] = -1
            phonet_hash_2[row, col] = -1

    # Rules are stored as flat triples; the search pattern is the first
    # element of each triple and the two replacements follow it.
    for idx in range(0, len(rules), 3):
        pattern = rules[idx]
        if not pattern:
            continue

        # first hash value: index of the first rule starting with this
        # character that has at least one replacement
        first = pattern[0]
        if phonet_hash[first] < 0 and (rules[idx + 1] or rules[idx + 2]):
            phonet_hash[first] = idx

        # second hash values exist only for the letters 'A'-'Z'
        if not first or alpha_pos[first] < 2:
            continue

        row = alpha_pos[first] - 2

        # Candidates for the second character: a blank if the pattern has
        # only one character, the contents of a "(...)" group, or the single
        # character that follows the first one.
        followers = pattern[1:]
        if not followers:
            followers = ' '
        elif followers[0] == '(':
            followers = followers[1:]
        else:
            followers = followers[0]

        for symbol in followers:
            if symbol == ')':
                break

            col = alpha_pos[symbol]

            if col > 0:
                # add hash value for this letter
                if phonet_hash_1[row, col] < 0:
                    phonet_hash_1[row, col] = idx
                    phonet_hash_2[row, col] = idx

                if phonet_hash_2[row, col] >= (idx - 30):
                    phonet_hash_2[row, col] = idx
                else:
                    col = -1

            if col <= 0:
                # add hash value for all letters
                if phonet_hash_1[row, 0] < 0:
                    phonet_hash_1[row, 0] = idx

                phonet_hash_2[row, 0] = idx
|
1216 | |||
def _phonet(term, mode, lang):
    """Return the phonet coded form of a term.

    The term is upper-cased with ``_phonet_upper_translation`` and scanned
    left to right. For each position the lookup tables built by
    ``_initialize_phonet`` (``alpha_pos``, ``phonet_hash``, ``phonet_hash_1``
    and ``phonet_hash_2``) give the range of candidate rules. The first rule
    whose pattern matches, and which is not overridden by a continuation
    rule of equal or higher priority, is applied.

    :param str term: the term to encode
    :param int mode: offset from a rule's pattern to the replacement to use,
        i.e. ``_phonet_rules[pos + mode]``
    :param str lang: 'none' selects ``_phonet_rules_no_lang``; any other
        value selects ``_phonet_rules_german``
    :returns: the encoded term; '' if ``term`` is empty
    :rtype: str
    """
    if lang == 'none':
        _phonet_rules = _phonet_rules_no_lang
    else:
        _phonet_rules = _phonet_rules_german

    char0 = ''
    dest = term

    if not term:
        return ''

    term_length = len(term)

    # convert input string to upper-case
    src = term.translate(_phonet_upper_translation)

    # check "src"
    i = 0  # read position in src
    j = 0  # write position in dest
    zeta = 0  # non-zero after a '<' rule was applied at this position

    while i < len(src):
        char = src[i]

        pos = alpha_pos[char]

        if pos >= 2:
            # 'A'-'Z': narrow the candidates using the following character
            xpos = pos - 2

            if i + 1 == len(src):
                pos = alpha_pos['']
            else:
                pos = alpha_pos[src[i + 1]]

            start1 = phonet_hash_1[xpos, pos]
            start2 = phonet_hash_1[xpos, 0]
            end1 = phonet_hash_2[xpos, pos]
            end2 = phonet_hash_2[xpos, 0]

            # preserve rule priorities
            if (start2 >= 0) and ((start1 < 0) or (start2 < start1)):
                pos = start1
                start1 = start2
                start2 = pos
                pos = end1
                end1 = end2
                end2 = pos

            if (end1 >= start2) and (start2 >= 0):
                # the two ranges overlap: merge them into one
                if end2 > end1:
                    end1 = end2

                start2 = -1
                end2 = -1
        else:
            # any other character: only the first-character hash applies
            pos = phonet_hash[char]
            start1 = pos
            end1 = 10000
            start2 = -1
            end2 = -1

        pos = start1
        zeta0 = 0  # set to 1 when the rule itself decides how to advance

        if pos >= 0:
            # check rules for this char
            while (_phonet_rules[pos] is None) or (
                _phonet_rules[pos][0] == char
            ):
                if pos > end1:
                    if start2 > 0:
                        # first range exhausted: continue with the second
                        pos = start2
                        start1 = start2
                        start2 = -1
                        end1 = end2
                        end2 = -1
                        continue

                    break

                if (_phonet_rules[pos] is None) or (
                    _phonet_rules[pos + mode] is None
                ):
                    # no conversion rule available
                    pos += 3
                    continue

                # check whole string
                matches = 1  # number of matching letters
                priority = 5  # default priority
                rule = _phonet_rules[pos]
                rule = rule[1:]

                # Match literal pattern characters. Stop at the first digit
                # or rule metacharacter. Only the NEXT pattern character is
                # tested: a substring test on the whole remaining pattern
                # lets a '-', '(', '<', '^' or '$' in the input be consumed
                # as a literal match.
                # NOTE(review): believed to match the reference C code's
                # per-character test; confirm against phonet.c.
                while (
                    rule
                    and (len(src) > (i + matches))
                    and (src[i + matches] == rule[0])
                    and not rule[0].isdigit()
                    and (rule[0] not in '(-<^$')
                ):
                    matches += 1
                    rule = rule[1:]

                if rule and (rule[0] == '('):
                    # check an array of letters
                    if (
                        (len(src) > (i + matches))
                        and src[i + matches].isalpha()
                        and (src[i + matches] in rule[1:])
                    ):
                        matches += 1

                        while rule and rule[0] != ')':
                            rule = rule[1:]

                        # if rule[0] == ')':
                        rule = rule[1:]

                # remember the first flag character (compared with '-' below)
                if rule:
                    priority0 = ord(rule[0])
                else:
                    priority0 = 0

                matches0 = matches

                # each '-' excludes one trailing matched letter
                while rule and rule[0] == '-' and matches > 1:
                    matches -= 1
                    rule = rule[1:]

                if rule and rule[0] == '<':
                    rule = rule[1:]

                if rule and rule[0].isdigit():
                    # read priority
                    priority = int(rule[0])
                    rule = rule[1:]

                if rule and rule[0:2] == '^^':
                    rule = rule[1:]

                # The rule matches if the pattern is used up, or if the
                # remaining '^' (no letter before) / '$' (neither a letter
                # nor '.' after) anchor is satisfied.
                if (
                    not rule
                    or (
                        (rule[0] == '^')
                        and ((i == 0) or not src[i - 1].isalpha())
                        and (
                            (rule[1:2] != '$')
                            or (
                                not (
                                    src[
                                        i + matches0 : i + matches0 + 1
                                    ].isalpha()
                                )
                                and (
                                    src[i + matches0 : i + matches0 + 1]
                                    != '.'
                                )
                            )
                        )
                    )
                    or (
                        (rule[0] == '$')
                        and (i > 0)
                        and src[i - 1].isalpha()
                        and (
                            (
                                not src[
                                    i + matches0 : i + matches0 + 1
                                ].isalpha()
                            )
                            and (
                                src[i + matches0 : i + matches0 + 1] != '.'
                            )
                        )
                    )
                ):
                    # look for continuation, if:
                    # matches > 1 and NO '-' in first string
                    pos0 = -1

                    start3 = 0
                    start4 = 0
                    end3 = 0
                    end4 = 0

                    if (
                        (matches > 1)
                        and src[i + matches : i + matches + 1]
                        and (priority0 != ord('-'))
                    ):
                        char0 = src[i + matches - 1]
                        pos0 = alpha_pos[char0]

                        if pos0 >= 2 and src[i + matches]:
                            xpos = pos0 - 2
                            pos0 = alpha_pos[src[i + matches]]
                            start3 = phonet_hash_1[xpos, pos0]
                            start4 = phonet_hash_1[xpos, 0]
                            end3 = phonet_hash_2[xpos, pos0]
                            end4 = phonet_hash_2[xpos, 0]

                            # preserve rule priorities
                            if (start4 >= 0) and (
                                (start3 < 0) or (start4 < start3)
                            ):
                                pos0 = start3
                                start3 = start4
                                start4 = pos0
                                pos0 = end3
                                end3 = end4
                                end4 = pos0

                            if (end3 >= start4) and (start4 >= 0):
                                if end4 > end3:
                                    end3 = end4

                                start4 = -1
                                end4 = -1
                        else:
                            pos0 = phonet_hash[char0]
                            start3 = pos0
                            end3 = 10000
                            start4 = -1
                            end4 = -1

                        pos0 = start3

                    # check continuation rules for src[i+matches]
                    if pos0 >= 0:
                        while (_phonet_rules[pos0] is None) or (
                            _phonet_rules[pos0][0] == char0
                        ):
                            if pos0 > end3:
                                if start4 > 0:
                                    pos0 = start4
                                    start3 = start4
                                    start4 = -1
                                    end3 = end4
                                    end4 = -1
                                    continue

                                priority0 = -1

                                # important
                                break

                            if (_phonet_rules[pos0] is None) or (
                                _phonet_rules[pos0 + mode] is None
                            ):
                                # no conversion rule available
                                pos0 += 3
                                continue

                            # check whole string
                            matches0 = matches
                            priority0 = 5
                            rule = _phonet_rules[pos0]
                            rule = rule[1:]

                            # Same literal-matching loop as for the primary
                            # rule: stop at a digit or at a rule
                            # metacharacter, testing only the next pattern
                            # character.
                            while (
                                rule
                                and (
                                    src[i + matches0 : i + matches0 + 1]
                                    == rule[0]
                                )
                                and not rule[0].isdigit()
                                and (rule[0] not in '(-<^$')
                            ):
                                matches0 += 1
                                rule = rule[1:]

                            if rule and rule[0] == '(':
                                # check an array of letters
                                if src[
                                    i + matches0 : i + matches0 + 1
                                ].isalpha() and (
                                    src[i + matches0] in rule[1:]
                                ):
                                    matches0 += 1

                                    while rule and rule[0] != ')':
                                        rule = rule[1:]

                                    # if rule[0] == ')':
                                    rule = rule[1:]

                            while rule and rule[0] == '-':
                                # "matches0" is NOT decremented
                                # because of "if (matches0 == matches)"
                                rule = rule[1:]

                            if rule and rule[0] == '<':
                                rule = rule[1:]

                            if rule and rule[0].isdigit():
                                priority0 = int(rule[0])
                                rule = rule[1:]

                            if (
                                not rule
                                or
                                # rule == '^' is not possible here
                                (
                                    (rule[0] == '$')
                                    and not src[
                                        i + matches0 : i + matches0 + 1
                                    ].isalpha()
                                    and (
                                        src[
                                            i + matches0 : i + matches0 + 1
                                        ]
                                        != '.'
                                    )
                                )
                            ):
                                if matches0 == matches:
                                    # this is only a partial string
                                    pos0 += 3
                                    continue

                                if priority0 < priority:
                                    # priority is too low
                                    pos0 += 3
                                    continue

                                # continuation rule found
                                break

                            pos0 += 3

                        # end of "while"
                        if (priority0 >= priority) and (
                            (_phonet_rules[pos0] is not None)
                            and (_phonet_rules[pos0][0] == char0)
                        ):
                            # a continuation rule wins: skip the current rule
                            pos += 3
                            continue

                    # replace string
                    if _phonet_rules[pos] and (
                        '<' in _phonet_rules[pos][1:]
                    ):
                        priority0 = 1
                    else:
                        priority0 = 0

                    rule = _phonet_rules[pos + mode]

                    if (priority0 == 1) and (zeta == 0):
                        # rule with '<' is applied
                        if (
                            (j > 0)
                            and rule
                            and (
                                (dest[j - 1] == char)
                                or (dest[j - 1] == rule[0])
                            )
                        ):
                            j -= 1

                        zeta0 = 1
                        zeta += 1
                        matches0 = 0

                        # Overwrite src in place with the replacement. The
                        # one-character slice is empty at the end of src, so
                        # a replacement longer than the rest of src ends the
                        # loop instead of raising IndexError.
                        while rule and src[i + matches0 : i + matches0 + 1]:
                            src = (
                                src[0 : i + matches0]
                                + rule[0]
                                + src[i + matches0 + 1 :]
                            )
                            matches0 += 1
                            rule = rule[1:]

                        if matches0 < matches:
                            # replacement shorter than the match: drop rest
                            src = (
                                src[0 : i + matches0] + src[i + matches :]
                            )

                        # '' if src now ends at i; the outer loop then stops
                        char = src[i : i + 1]
                    else:
                        i = i + matches - 1
                        zeta = 0

                        # copy all but the last replacement character to
                        # dest, collapsing repeated characters
                        while len(rule) > 1:
                            if (j == 0) or (dest[j - 1] != rule[0]):
                                dest = (
                                    dest[0:j]
                                    + rule[0]
                                    + dest[min(len(dest), j + 1) :]
                                )
                                j += 1

                            rule = rule[1:]

                        # new "current char"
                        if not rule:
                            rule = ''
                            char = ''
                        else:
                            char = rule[0]

                        if (
                            _phonet_rules[pos]
                            and '^^' in _phonet_rules[pos][1:]
                        ):
                            # '^^' rule: emit the character now and restart
                            # the scan on the remainder of src
                            if char:
                                dest = (
                                    dest[0:j]
                                    + char
                                    + dest[min(len(dest), j + 1) :]
                                )
                                j += 1

                            src = src[i + 1 :]
                            i = 0
                            zeta0 = 1

                    break

                pos += 3

                if pos > end1 and start2 > 0:
                    pos = start2
                    start1 = start2
                    end1 = end2
                    start2 = -1
                    end2 = -1

        if zeta0 == 0:
            if char and ((j == 0) or (dest[j - 1] != char)):
                # delete multiple letters only
                dest = dest[0:j] + char + dest[min(j + 1, term_length) :]
                j += 1

            i += 1
            zeta = 0

    # dest started as a copy of term; keep only what was written
    dest = dest[0:j]

    return dest
|
1662 | |||
1663 | 1 | _initialize_phonet(lang) |
|
1664 | |||
1665 | 1 | word = unicode_normalize('NFKC', text_type(word)) |
|
1666 | 1 | return _phonet(word, mode, lang) |
|
1667 | |||
1673 |