Completed
Push — master ( f43547...71985b )
by Chris
12:00 queued 10s
created

abydos.phonetic._phonet.phonet()   A

Complexity

Conditions 1

Size

Total Lines 50
Code Lines 2

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 2
CRAP Score 1

Importance

Changes 0
Metric Value
cc 1
eloc 2
nop 3
dl 0
loc 50
ccs 2
cts 2
cp 1
crap 1
rs 10
c 0
b 0
f 0
1
# -*- coding: utf-8 -*-
0 ignored issues
show
coding-style introduced by
Too many lines in module (1780/1000)
Loading history...
2
3
# Copyright 2014-2018 by Christopher C. Little.
4
# This file is part of Abydos.
5
#
6
# Abydos is free software: you can redistribute it and/or modify
7
# it under the terms of the GNU General Public License as published by
8
# the Free Software Foundation, either version 3 of the License, or
9
# (at your option) any later version.
10
#
11
# Abydos is distributed in the hope that it will be useful,
12
# but WITHOUT ANY WARRANTY; without even the implied warranty of
13
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14
# GNU General Public License for more details.
15
#
16
# You should have received a copy of the GNU General Public License
17
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.
18
19 1
"""abydos.phonetic._phonet.
20
21
phonet algorithm (a.k.a. Hannoveraner Phonetik), intended chiefly for German
22
"""
23
24 1
from __future__ import (
25
    absolute_import,
26
    division,
27
    print_function,
28
    unicode_literals,
29
)
30
31 1
from collections import Counter
32 1
from unicodedata import normalize as unicode_normalize
33
34 1
from six import text_type
35 1
from six.moves import range
36
37 1
from ._phonetic import _Phonetic
38
39 1
__all__ = ['Phonet', 'phonet']
40
41
42 1
class Phonet(_Phonetic):
0 ignored issues
show
Unused Code introduced by
The variable __class__ seems to be unused.
Loading history...
43
    """Phonet code.
44
45
    phonet ("Hannoveraner Phonetik") was developed by Jörg Michael and
46
    documented in :cite:`Michael:1999`.
47
48
    This is a port of Jesper Zedlitz's code, which is licensed LGPL
49
    :cite:`Zedlitz:2015`.
50
51
    That is, in turn, based on Michael's C code, which is also licensed LGPL
52
    :cite:`Michael:2007`.
53
    """
54
55 1
    # Language-independent rule table, stored as one flat tuple laid out three
    # items per row and closed by a final (None, None, None) row.
    # In this table the second and third item of every row are identical.
    # NOTE(review): presumably each row is (search pattern, replacement for
    # the first rule set, replacement for the second rule set), and the
    # closing None row is an end-of-table sentinel -- confirm against the
    # code that consumes this table.
    _rules_no_lang = (  # separator chars
56
        # fmt: off
57
        '´', ' ', ' ',
58
        '"', ' ', ' ',
59
        '`$', '', '',
60
        '\'', ' ', ' ',
61
        ',', ',', ',',
62
        ';', ',', ',',
63
        '-', ' ', ' ',
64
        ' ', ' ', ' ',
65
        '.', '.', '.',
66
        ':', '.', '.',
67
        # German umlauts
68
        'Ä', 'AE', 'AE',
69
        'Ö', 'OE', 'OE',
70
        'Ü', 'UE', 'UE',
71
        'ß', 'S', 'S',
72
        # international umlauts
73
        'À', 'A', 'A',
74
        'Á', 'A', 'A',
75
        'Â', 'A', 'A',
76
        'Ã', 'A', 'A',
77
        'Å', 'A', 'A',
78
        'Æ', 'AE', 'AE',
79
        'Ç', 'C', 'C',
80
        'Ð', 'DJ', 'DJ',
81
        'È', 'E', 'E',
82
        'É', 'E', 'E',
83
        'Ê', 'E', 'E',
84
        'Ë', 'E', 'E',
85
        'Ì', 'I', 'I',
86
        'Í', 'I', 'I',
87
        'Î', 'I', 'I',
88
        'Ï', 'I', 'I',
89
        'Ñ', 'NH', 'NH',
90
        'Ò', 'O', 'O',
91
        'Ó', 'O', 'O',
92
        'Ô', 'O', 'O',
93
        'Õ', 'O', 'O',
94
        'Œ', 'OE', 'OE',
95
        'Ø', 'OE', 'OE',
96
        'Š', 'SH', 'SH',
97
        'Þ', 'TH', 'TH',
98
        'Ù', 'U', 'U',
99
        'Ú', 'U', 'U',
100
        'Û', 'U', 'U',
101
        'Ý', 'Y', 'Y',
102
        'Ÿ', 'Y', 'Y',
103
        # 'normal' letters (A-Z)
104
        'MC^', 'MAC', 'MAC',
105
        # NOTE(review): the next row is an exact duplicate of the 'MC^' row
        # directly before it -- verify whether the repetition is intentional
        # before removing it, since the table's length and order may matter
        # to the consumer.
        'MC^', 'MAC', 'MAC',
106
        'M´^', 'MAC', 'MAC',
107
        'M\'^', 'MAC', 'MAC',
108
        'O´^', 'O', 'O',
109
        'O\'^', 'O', 'O',
110
        'VAN DEN ^', 'VANDEN', 'VANDEN',
111
        # Closing row of three None values marks the end of the table.
        None, None, None
112
        # fmt: on
113
    )
114
115 1
    _rules_german = (  # separator chars
116
        # fmt: off
117
        '´', ' ', ' ',
118
        '"', ' ', ' ',
119
        '`$', '', '',
120
        '\'', ' ', ' ',
121
        ',', ' ', ' ',
122
        ';', ' ', ' ',
123
        '-', ' ', ' ',
124
        ' ', ' ', ' ',
125
        '.', '.', '.',
126
        ':', '.', '.',
127
        # German umlauts
128
        'ÄE', 'E', 'E',
129
        'ÄU<', 'EU', 'EU',
130
        'ÄV(AEOU)-<', 'EW', None,
131
        'Ä$', 'Ä', None,
132
        'Ä<', None, 'E',
133
        'Ä', 'E', None,
134
        'ÖE', 'Ö', 'Ö',
135
        'ÖU', 'Ö', 'Ö',
136
        'ÖVER--<', 'ÖW', None,
137
        'ÖV(AOU)-', 'ÖW', None,
138
        'ÜBEL(GNRW)-^^', 'ÜBL ', 'IBL ',
139
        'ÜBER^^', 'ÜBA', 'IBA',
140
        'ÜE', 'Ü', 'I',
141
        'ÜVER--<', 'ÜW', None,
142
        'ÜV(AOU)-', 'ÜW', None,
143
        'Ü', None, 'I',
144
        'ßCH<', None, 'Z',
145
        'ß<', 'S', 'Z',
146
        # international umlauts
147
        'À<', 'A', 'A',
148
        'Á<', 'A', 'A',
149
        'Â<', 'A', 'A',
150
        'Ã<', 'A', 'A',
151
        'Å<', 'A', 'A',
152
        'ÆER-', 'E', 'E',
153
        'ÆU<', 'EU', 'EU',
154
        'ÆV(AEOU)-<', 'EW', None,
155
        'Æ$', 'Ä', None,
156
        'Æ<', None, 'E',
157
        'Æ', 'E', None,
158
        'Ç', 'Z', 'Z',
159
        'ÐÐ-', '', '',
160
        'Ð', 'DI', 'TI',
161
        'È<', 'E', 'E',
162
        'É<', 'E', 'E',
163
        'Ê<', 'E', 'E',
164
        'Ë', 'E', 'E',
165
        'Ì<', 'I', 'I',
166
        'Í<', 'I', 'I',
167
        'Î<', 'I', 'I',
168
        'Ï', 'I', 'I',
169
        'ÑÑ-', '', '',
170
        'Ñ', 'NI', 'NI',
171
        'Ò<', 'O', 'U',
172
        'Ó<', 'O', 'U',
173
        'Ô<', 'O', 'U',
174
        'Õ<', 'O', 'U',
175
        'Œ<', 'Ö', 'Ö',
176
        'Ø(IJY)-<', 'E', 'E',
177
        'Ø<', 'Ö', 'Ö',
178
        'Š', 'SH', 'Z',
179
        'Þ', 'T', 'T',
180
        'Ù<', 'U', 'U',
181
        'Ú<', 'U', 'U',
182
        'Û<', 'U', 'U',
183
        'Ý<', 'I', 'I',
184
        'Ÿ<', 'I', 'I',
185
        # 'normal' letters (A-Z)
186
        'ABELLE$', 'ABL', 'ABL',
187
        'ABELL$', 'ABL', 'ABL',
188
        'ABIENNE$', 'ABIN', 'ABIN',
189
        'ACHME---^', 'ACH', 'AK',
190
        'ACEY$', 'AZI', 'AZI',
191
        'ADV', 'ATW', None,
192
        'AEGL-', 'EK', None,
193
        'AEU<', 'EU', 'EU',
194
        'AE2', 'E', 'E',
195
        'AFTRAUBEN------', 'AFT ', 'AFT ',
196
        'AGL-1', 'AK', None,
197
        'AGNI-^', 'AKN', 'AKN',
198
        'AGNIE-', 'ANI', 'ANI',
199
        'AGN(AEOU)-$', 'ANI', 'ANI',
200
        'AH(AIOÖUÜY)-', 'AH', None,
201
        'AIA2', 'AIA', 'AIA',
202
        'AIE$', 'E', 'E',
203
        'AILL(EOU)-', 'ALI', 'ALI',
204
        'AINE$', 'EN', 'EN',
205
        'AIRE$', 'ER', 'ER',
206
        'AIR-', 'E', 'E',
207
        'AISE$', 'ES', 'EZ',
208
        'AISSANCE$', 'ESANS', 'EZANZ',
209
        'AISSE$', 'ES', 'EZ',
210
        'AIX$', 'EX', 'EX',
211
        'AJ(AÄEÈÉÊIOÖUÜ)--', 'A', 'A',
212
        'AKTIE', 'AXIE', 'AXIE',
213
        'AKTUEL', 'AKTUEL', None,
214
        'ALOI^', 'ALOI', 'ALUI',  # Don't merge these rules
215
        'ALOY^', 'ALOI', 'ALUI',  # needed by 'check_rules'
216
        'AMATEU(RS)-', 'AMATÖ', 'ANATÖ',
217
        'ANCH(OEI)-', 'ANSH', 'ANZ',
218
        'ANDERGEGANG----', 'ANDA GE', 'ANTA KE',
219
        'ANDERGEHE----', 'ANDA ', 'ANTA ',
220
        'ANDERGESETZ----', 'ANDA GE', 'ANTA KE',
221
        'ANDERGING----', 'ANDA ', 'ANTA ',
222
        'ANDERSETZ(ET)-----', 'ANDA ', 'ANTA ',
223
        'ANDERZUGEHE----', 'ANDA ZU ', 'ANTA ZU ',
224
        'ANDERZUSETZE-----', 'ANDA ZU ', 'ANTA ZU ',
225
        'ANER(BKO)---^^', 'AN', None,
226
        'ANHAND---^$', 'AN H', 'AN ',
227
        'ANH(AÄEIOÖUÜY)--^^', 'AN', None,
228
        'ANIELLE$', 'ANIEL', 'ANIL',
229
        'ANIEL', 'ANIEL', None,
230
        'ANSTELLE----^$', 'AN ST', 'AN ZT',
231
        'ANTI^^', 'ANTI', 'ANTI',
232
        'ANVER^^', 'ANFA', 'ANFA',
233
        'ATIA$', 'ATIA', 'ATIA',
234
        'ATIA(NS)--', 'ATI', 'ATI',
235
        'ATI(AÄOÖUÜ)-', 'AZI', 'AZI',
236
        'AUAU--', '', '',
237
        'AUERE$', 'AUERE', None,
238
        'AUERE(NS)-$', 'AUERE', None,
239
        'AUERE(AIOUY)--', 'AUER', None,
240
        'AUER(AÄIOÖUÜY)-', 'AUER', None,
241
        'AUER<', 'AUA', 'AUA',
242
        'AUF^^', 'AUF', 'AUF',
243
        'AULT$', 'O', 'U',
244
        'AUR(BCDFGKLMNQSTVWZ)-', 'AUA', 'AUA',
245
        'AUR$', 'AUA', 'AUA',
246
        'AUSSE$', 'OS', 'UZ',
247
        'AUS(ST)-^', 'AUS', 'AUS',
248
        'AUS^^', 'AUS', 'AUS',
249
        'AUTOFAHR----', 'AUTO ', 'AUTU ',
250
        'AUTO^^', 'AUTO', 'AUTU',
251
        'AUX(IY)-', 'AUX', 'AUX',
252
        'AUX', 'O', 'U',
253
        'AU', 'AU', 'AU',
254
        'AVER--<', 'AW', None,
255
        'AVIER$', 'AWIE', 'AFIE',
256
        'AV(EÈÉÊI)-^', 'AW', None,
257
        'AV(AOU)-', 'AW', None,
258
        'AYRE$', 'EIRE', 'EIRE',
259
        'AYRE(NS)-$', 'EIRE', 'EIRE',
260
        'AYRE(AIOUY)--', 'EIR', 'EIR',
261
        'AYR(AÄIOÖUÜY)-', 'EIR', 'EIR',
262
        'AYR<', 'EIA', 'EIA',
263
        'AYER--<', 'EI', 'EI',
264
        'AY(AÄEIOÖUÜY)--', 'A', 'A',
265
        'AË', 'E', 'E',
266
        'A(IJY)<', 'EI', 'EI',
267
        'BABY^$', 'BEBI', 'BEBI',
268
        'BAB(IY)^', 'BEBI', 'BEBI',
269
        'BEAU^$', 'BO', None,
270
        'BEA(BCMNRU)-^', 'BEA', 'BEA',
271
        'BEAT(AEIMORU)-^', 'BEAT', 'BEAT',
272
        'BEE$', 'BI', 'BI',
273
        'BEIGE^$', 'BESH', 'BEZ',
274
        'BENOIT--', 'BENO', 'BENU',
275
        'BER(DT)-', 'BER', None,
276
        'BERN(DT)-', 'BERN', None,
277
        'BE(LMNRST)-^', 'BE', 'BE',
278
        'BETTE$', 'BET', 'BET',
279
        'BEVOR^$', 'BEFOR', None,
280
        'BIC$', 'BIZ', 'BIZ',
281
        'BOWL(EI)-', 'BOL', 'BUL',
282
        'BP(AÄEÈÉÊIÌÍÎOÖRUÜY)-', 'B', 'B',
283
        'BRINGEND-----^', 'BRI', 'BRI',
284
        'BRINGEND-----', ' BRI', ' BRI',
285
        'BROW(NS)-', 'BRAU', 'BRAU',
286
        'BUDGET7', 'BÜGE', 'BIKE',
287
        'BUFFET7', 'BÜFE', 'BIFE',
288
        'BYLLE$', 'BILE', 'BILE',
289
        'BYLL$', 'BIL', 'BIL',
290
        'BYPA--^', 'BEI', 'BEI',
291
        'BYTE<', 'BEIT', 'BEIT',
292
        'BY9^', 'BÜ', None,
293
        'B(SßZ)$', 'BS', None,
294
        'CACH(EI)-^', 'KESH', 'KEZ',
295
        'CAE--', 'Z', 'Z',
296
        'CA(IY)$', 'ZEI', 'ZEI',
297
        'CE(EIJUY)--', 'Z', 'Z',
298
        'CENT<', 'ZENT', 'ZENT',
299
        'CERST(EI)----^', 'KE', 'KE',
300
        'CER$', 'ZA', 'ZA',
301
        'CE3', 'ZE', 'ZE',
302
        'CH\'S$', 'X', 'X',
303
        'CH´S$', 'X', 'X',
304
        'CHAO(ST)-', 'KAO', 'KAU',
305
        'CHAMPIO-^', 'SHEMPI', 'ZENBI',
306
        'CHAR(AI)-^', 'KAR', 'KAR',
307
        'CHAU(CDFSVWXZ)-', 'SHO', 'ZU',
308
        'CHÄ(CF)-', 'SHE', 'ZE',
309
        'CHE(CF)-', 'SHE', 'ZE',
310
        'CHEM-^', 'KE', 'KE',  # or: 'CHE', 'KE'
311
        'CHEQUE<', 'SHEK', 'ZEK',
312
        'CHI(CFGPVW)-', 'SHI', 'ZI',
313
        'CH(AEUY)-<^', 'SH', 'Z',
314
        'CHK-', '', '',
315
        'CHO(CKPS)-^', 'SHO', 'ZU',
316
        'CHRIS-', 'KRI', None,
317
        'CHRO-', 'KR', None,
318
        'CH(LOR)-<^', 'K', 'K',
319
        'CHST-', 'X', 'X',
320
        'CH(SßXZ)3', 'X', 'X',
321
        'CHTNI-3', 'CHN', 'KN',
322
        'CH^', 'K', 'K',  # or: 'CH', 'K'
323
        'CH', 'CH', 'K',
324
        'CIC$', 'ZIZ', 'ZIZ',
325
        'CIENCEFICT----', 'EIENS ', 'EIENZ ',
326
        'CIENCE$', 'EIENS', 'EIENZ',
327
        'CIER$', 'ZIE', 'ZIE',
328
        'CYB-^', 'ZEI', 'ZEI',
329
        'CY9^', 'ZÜ', 'ZI',
330
        'C(IJY)-<3', 'Z', 'Z',
331
        'CLOWN-', 'KLAU', 'KLAU',
332
        'CCH', 'Z', 'Z',
333
        'CCE-', 'X', 'X',
334
        'C(CK)-', '', '',
335
        'CLAUDET---', 'KLO', 'KLU',
336
        'CLAUDINE^$', 'KLODIN', 'KLUTIN',
337
        'COACH', 'KOSH', 'KUZ',
338
        'COLE$', 'KOL', 'KUL',
339
        'COUCH', 'KAUSH', 'KAUZ',
340
        'COW', 'KAU', 'KAU',
341
        'CQUES$', 'K', 'K',
342
        'CQUE', 'K', 'K',
343
        'CRASH--9', 'KRE', 'KRE',
344
        'CREAT-^', 'KREA', 'KREA',
345
        'CST', 'XT', 'XT',
346
        'CS<^', 'Z', 'Z',
347
        'C(SßX)', 'X', 'X',
348
        'CT\'S$', 'X', 'X',
349
        'CT(SßXZ)', 'X', 'X',
350
        'CZ<', 'Z', 'Z',
351
        'C(ÈÉÊÌÍÎÝ)3', 'Z', 'Z',
352
        'C.^', 'C.', 'C.',
353
        'CÄ-', 'Z', 'Z',
354
        'CÜ$', 'ZÜ', 'ZI',
355
        'C\'S$', 'X', 'X',
356
        'C<', 'K', 'K',
357
        'DAHER^$', 'DAHER', None,
358
        'DARAUFFOLGE-----', 'DARAUF ', 'TARAUF ',
359
        'DAVO(NR)-^$', 'DAFO', 'TAFU',
360
        'DD(SZ)--<', '', '',
361
        'DD9', 'D', None,
362
        'DEPOT7', 'DEPO', 'TEBU',
363
        'DESIGN', 'DISEIN', 'TIZEIN',
364
        'DE(LMNRST)-3^', 'DE', 'TE',
365
        'DETTE$', 'DET', 'TET',
366
        'DH$', 'T', None,
367
        'DIC$', 'DIZ', 'TIZ',
368
        'DIDR-^', 'DIT', None,
369
        'DIEDR-^', 'DIT', None,
370
        'DJ(AEIOU)-^', 'I', 'I',
371
        'DMITR-^', 'DIMIT', 'TINIT',
372
        'DRY9^', 'DRÜ', None,
373
        'DT-', '', '',
374
        'DUIS-^', 'DÜ', 'TI',
375
        'DURCH^^', 'DURCH', 'TURK',
376
        'DVA$', 'TWA', None,
377
        'DY9^', 'DÜ', None,
378
        'DYS$', 'DIS', None,
379
        'DS(CH)--<', 'T', 'T',
380
        'DST', 'ZT', 'ZT',
381
        'DZS(CH)--', 'T', 'T',
382
        'D(SßZ)', 'Z', 'Z',
383
        'D(AÄEIOÖRUÜY)-', 'D', None,
384
        'D(ÀÁÂÃÅÈÉÊÌÍÎÙÚÛ)-', 'D', None,
385
        'D\'H^', 'D', 'T',
386
        'D´H^', 'D', 'T',
387
        'D`H^', 'D', 'T',
388
        'D\'S3$', 'Z', 'Z',
389
        'D´S3$', 'Z', 'Z',
390
        'D^', 'D', None,
391
        'D', 'T', 'T',
392
        'EAULT$', 'O', 'U',
393
        'EAUX$', 'O', 'U',
394
        'EAU', 'O', 'U',
395
        'EAV', 'IW', 'IF',
396
        'EAS3$', 'EAS', None,
397
        'EA(AÄEIOÖÜY)-3', 'EA', 'EA',
398
        'EA3$', 'EA', 'EA',
399
        'EA3', 'I', 'I',
400
        'EBENSO^$', 'EBNSO', 'EBNZU',
401
        'EBENSO^^', 'EBNSO ', 'EBNZU ',
402
        'EBEN^^', 'EBN', 'EBN',
403
        'EE9', 'E', 'E',
404
        'EGL-1', 'EK', None,
405
        'EHE(IUY)--1', 'EH', None,
406
        'EHUNG---1', 'E', None,
407
        'EH(AÄIOÖUÜY)-1', 'EH', None,
408
        'EIEI--', '', '',
409
        'EIERE^$', 'EIERE', None,
410
        'EIERE$', 'EIERE', None,
411
        'EIERE(NS)-$', 'EIERE', None,
412
        'EIERE(AIOUY)--', 'EIER', None,
413
        'EIER(AÄIOÖUÜY)-', 'EIER', None,
414
        'EIER<', 'EIA', None,
415
        'EIGL-1', 'EIK', None,
416
        'EIGH$', 'EI', 'EI',
417
        'EIH--', 'E', 'E',
418
        'EILLE$', 'EI', 'EI',
419
        'EIR(BCDFGKLMNQSTVWZ)-', 'EIA', 'EIA',
420
        'EIR$', 'EIA', 'EIA',
421
        'EITRAUBEN------', 'EIT ', 'EIT ',
422
        'EI', 'EI', 'EI',
423
        'EJ$', 'EI', 'EI',
424
        'ELIZ^', 'ELIS', None,
425
        'ELZ^', 'ELS', None,
426
        'EL-^', 'E', 'E',
427
        'ELANG----1', 'E', 'E',
428
        'EL(DKL)--1', 'E', 'E',
429
        'EL(MNT)--1$', 'E', 'E',
430
        'ELYNE$', 'ELINE', 'ELINE',
431
        'ELYN$', 'ELIN', 'ELIN',
432
        'EL(AÄEÈÉÊIÌÍÎOÖUÜY)-1', 'EL', 'EL',
433
        'EL-1', 'L', 'L',
434
        'EM-^', None, 'E',
435
        'EM(DFKMPQT)--1', None, 'E',
436
        'EM(AÄEÈÉÊIÌÍÎOÖUÜY)--1', None, 'E',
437
        'EM-1', None, 'N',
438
        'ENGAG-^', 'ANGA', 'ANKA',
439
        'EN-^', 'E', 'E',
440
        'ENTUEL', 'ENTUEL', None,
441
        'EN(CDGKQSTZ)--1', 'E', 'E',
442
        'EN(AÄEÈÉÊIÌÍÎNOÖUÜY)-1', 'EN', 'EN',
443
        'EN-1', '', '',
444
        'ERH(AÄEIOÖUÜ)-^', 'ERH', 'ER',
445
        'ER-^', 'E', 'E',
446
        'ERREGEND-----', ' ER', ' ER',
447
        'ERT1$', 'AT', None,
448
        'ER(DGLKMNRQTZß)-1', 'ER', None,
449
        'ER(AÄEÈÉÊIÌÍÎOÖUÜY)-1', 'ER', 'A',
450
        'ER1$', 'A', 'A',
451
        'ER<1', 'A', 'A',
452
        'ETAT7', 'ETA', 'ETA',
453
        'ETI(AÄOÖÜU)-', 'EZI', 'EZI',
454
        'EUERE$', 'EUERE', None,
455
        'EUERE(NS)-$', 'EUERE', None,
456
        'EUERE(AIOUY)--', 'EUER', None,
457
        'EUER(AÄIOÖUÜY)-', 'EUER', None,
458
        'EUER<', 'EUA', None,
459
        'EUEU--', '', '',
460
        'EUILLE$', 'Ö', 'Ö',
461
        'EUR$', 'ÖR', 'ÖR',
462
        'EUX', 'Ö', 'Ö',
463
        'EUSZ$', 'EUS', None,
464
        'EUTZ$', 'EUS', None,
465
        'EUYS$', 'EUS', 'EUZ',
466
        'EUZ$', 'EUS', None,
467
        'EU', 'EU', 'EU',
468
        'EVER--<1', 'EW', None,
469
        'EV(ÄOÖUÜ)-1', 'EW', None,
470
        'EYER<', 'EIA', 'EIA',
471
        'EY<', 'EI', 'EI',
472
        'FACETTE', 'FASET', 'FAZET',
473
        'FANS--^$', 'FE', 'FE',
474
        'FAN-^$', 'FE', 'FE',
475
        'FAULT-', 'FOL', 'FUL',
476
        'FEE(DL)-', 'FI', 'FI',
477
        'FEHLER', 'FELA', 'FELA',
478
        'FE(LMNRST)-3^', 'FE', 'FE',
479
        'FOERDERN---^', 'FÖRD', 'FÖRT',
480
        'FOERDERN---', ' FÖRD', ' FÖRT',
481
        'FOND7', 'FON', 'FUN',
482
        'FRAIN$', 'FRA', 'FRA',
483
        'FRISEU(RS)-', 'FRISÖ', 'FRIZÖ',
484
        'FY9^', 'FÜ', None,
485
        'FÖRDERN---^', 'FÖRD', 'FÖRT',
486
        'FÖRDERN---', ' FÖRD', ' FÖRT',
487
        'GAGS^$', 'GEX', 'KEX',
488
        'GAG^$', 'GEK', 'KEK',
489
        'GD', 'KT', 'KT',
490
        'GEGEN^^', 'GEGN', 'KEKN',
491
        'GEGENGEKOM-----', 'GEGN ', 'KEKN ',
492
        'GEGENGESET-----', 'GEGN ', 'KEKN ',
493
        'GEGENKOMME-----', 'GEGN ', 'KEKN ',
494
        'GEGENZUKOM---', 'GEGN ZU ', 'KEKN ZU ',
495
        'GENDETWAS-----$', 'GENT ', 'KENT ',
496
        'GENRE', 'IORE', 'IURE',
497
        'GE(LMNRST)-3^', 'GE', 'KE',
498
        'GER(DKT)-', 'GER', None,
499
        'GETTE$', 'GET', 'KET',
500
        'GGF.', 'GF.', None,
501
        'GG-', '', '',
502
        'GH', 'G', None,
503
        'GI(AOU)-^', 'I', 'I',
504
        'GION-3', 'KIO', 'KIU',
505
        'G(CK)-', '', '',
506
        'GJ(AEIOU)-^', 'I', 'I',
507
        'GMBH^$', 'GMBH', 'GMBH',
508
        'GNAC$', 'NIAK', 'NIAK',
509
        'GNON$', 'NION', 'NIUN',
510
        'GN$', 'N', 'N',
511
        'GONCAL-^', 'GONZA', 'KUNZA',
512
        'GRY9^', 'GRÜ', None,
513
        'G(SßXZ)-<', 'K', 'K',
514
        'GUCK-', 'KU', 'KU',
515
        'GUISEP-^', 'IUSE', 'IUZE',
516
        'GUI-^', 'G', 'K',
517
        'GUTAUSSEH------^', 'GUT ', 'KUT ',
518
        'GUTGEHEND------^', 'GUT ', 'KUT ',
519
        'GY9^', 'GÜ', None,
520
        'G(AÄEILOÖRUÜY)-', 'G', None,
521
        'G(ÀÁÂÃÅÈÉÊÌÍÎÙÚÛ)-', 'G', None,
522
        'G\'S$', 'X', 'X',
523
        'G´S$', 'X', 'X',
524
        'G^', 'G', None,
525
        'G', 'K', 'K',
526
        'HA(HIUY)--1', 'H', None,
527
        'HANDVOL---^', 'HANT ', 'ANT ',
528
        'HANNOVE-^', 'HANOF', None,
529
        'HAVEN7$', 'HAFN', None,
530
        'HEAD-', 'HE', 'E',
531
        'HELIEGEN------', 'E ', 'E ',
532
        'HESTEHEN------', 'E ', 'E ',
533
        'HE(LMNRST)-3^', 'HE', 'E',
534
        'HE(LMN)-1', 'E', 'E',
535
        'HEUR1$', 'ÖR', 'ÖR',
536
        'HE(HIUY)--1', 'H', None,
537
        'HIH(AÄEIOÖUÜY)-1', 'IH', None,
538
        'HLH(AÄEIOÖUÜY)-1', 'LH', None,
539
        'HMH(AÄEIOÖUÜY)-1', 'MH', None,
540
        'HNH(AÄEIOÖUÜY)-1', 'NH', None,
541
        'HOBBY9^', 'HOBI', None,
542
        'HOCHBEGAB-----^', 'HOCH ', 'UK ',
543
        'HOCHTALEN-----^', 'HOCH ', 'UK ',
544
        'HOCHZUFRI-----^', 'HOCH ', 'UK ',
545
        'HO(HIY)--1', 'H', None,
546
        'HRH(AÄEIOÖUÜY)-1', 'RH', None,
547
        'HUH(AÄEIOÖUÜY)-1', 'UH', None,
548
        'HUIS^^', 'HÜS', 'IZ',
549
        'HUIS$', 'ÜS', 'IZ',
550
        'HUI--1', 'H', None,
551
        'HYGIEN^', 'HÜKIEN', None,
552
        'HY9^', 'HÜ', None,
553
        'HY(BDGMNPST)-', 'Ü', None,
554
        'H.^', None, 'H.',
555
        'HÄU--1', 'H', None,
556
        'H^', 'H', '',
557
        'H', '', '',
558
        'ICHELL---', 'ISH', 'IZ',
559
        'ICHI$', 'ISHI', 'IZI',
560
        'IEC$', 'IZ', 'IZ',
561
        'IEDENSTELLE------', 'IDN ', 'ITN ',
562
        'IEI-3', '', '',
563
        'IELL3', 'IEL', 'IEL',
564
        'IENNE$', 'IN', 'IN',
565
        'IERRE$', 'IER', 'IER',
566
        'IERZULAN---', 'IR ZU ', 'IR ZU ',
567
        'IETTE$', 'IT', 'IT',
568
        'IEU', 'IÖ', 'IÖ',
569
        'IE<4', 'I', 'I',
570
        'IGL-1', 'IK', None,
571
        'IGHT3$', 'EIT', 'EIT',
572
        'IGNI(EO)-', 'INI', 'INI',
573
        'IGN(AEOU)-$', 'INI', 'INI',
574
        'IHER(DGLKRT)--1', 'IHE', None,
575
        'IHE(IUY)--', 'IH', None,
576
        'IH(AIOÖUÜY)-', 'IH', None,
577
        'IJ(AOU)-', 'I', 'I',
578
        'IJ$', 'I', 'I',
579
        'IJ<', 'EI', 'EI',
580
        'IKOLE$', 'IKOL', 'IKUL',
581
        'ILLAN(STZ)--4', 'ILIA', 'ILIA',
582
        'ILLAR(DT)--4', 'ILIA', 'ILIA',
583
        'IMSTAN----^', 'IM ', 'IN ',
584
        'INDELERREGE------', 'INDL ', 'INTL ',
585
        'INFRAGE-----^$', 'IN ', 'IN ',
586
        'INTERN(AOU)-^', 'INTAN', 'INTAN',
587
        'INVER-', 'INWE', 'INFE',
588
        'ITI(AÄIOÖUÜ)-', 'IZI', 'IZI',
589
        'IUSZ$', 'IUS', None,
590
        'IUTZ$', 'IUS', None,
591
        'IUZ$', 'IUS', None,
592
        'IVER--<', 'IW', None,
593
        'IVIER$', 'IWIE', 'IFIE',
594
        'IV(ÄOÖUÜ)-', 'IW', None,
595
        'IV<3', 'IW', None,
596
        'IY2', 'I', None,
597
        'I(ÈÉÊ)<4', 'I', 'I',
598
        'JAVIE---<^', 'ZA', 'ZA',
599
        'JEANS^$', 'JINS', 'INZ',
600
        'JEANNE^$', 'IAN', 'IAN',
601
        'JEAN-^', 'IA', 'IA',
602
        'JER-^', 'IE', 'IE',
603
        'JE(LMNST)-', 'IE', 'IE',
604
        'JI^', 'JI', None,
605
        'JOR(GK)^$', 'IÖRK', 'IÖRK',
606
        'J', 'I', 'I',
607
        'KC(ÄEIJ)-', 'X', 'X',
608
        'KD', 'KT', None,
609
        'KE(LMNRST)-3^', 'KE', 'KE',
610
        'KG(AÄEILOÖRUÜY)-', 'K', None,
611
        'KH<^', 'K', 'K',
612
        'KIC$', 'KIZ', 'KIZ',
613
        'KLE(LMNRST)-3^', 'KLE', 'KLE',
614
        'KOTELE-^', 'KOTL', 'KUTL',
615
        'KREAT-^', 'KREA', 'KREA',
616
        'KRÜS(TZ)--^', 'KRI', None,
617
        'KRYS(TZ)--^', 'KRI', None,
618
        'KRY9^', 'KRÜ', None,
619
        'KSCH---', 'K', 'K',
620
        'KSH--', 'K', 'K',
621
        'K(SßXZ)7', 'X', 'X',  # implies 'KST' -> 'XT'
622
        'KT\'S$', 'X', 'X',
623
        'KTI(AIOU)-3', 'XI', 'XI',
624
        'KT(SßXZ)', 'X', 'X',
625
        'KY9^', 'KÜ', None,
626
        'K\'S$', 'X', 'X',
627
        'K´S$', 'X', 'X',
628
        'LANGES$', ' LANGES', ' LANKEZ',
629
        'LANGE$', ' LANGE', ' LANKE',
630
        'LANG$', ' LANK', ' LANK',
631
        'LARVE-', 'LARF', 'LARF',
632
        'LD(SßZ)$', 'LS', 'LZ',
633
        'LD\'S$', 'LS', 'LZ',
634
        'LD´S$', 'LS', 'LZ',
635
        'LEAND-^', 'LEAN', 'LEAN',
636
        'LEERSTEHE-----^', 'LER ', 'LER ',
637
        'LEICHBLEIB-----', 'LEICH ', 'LEIK ',
638
        'LEICHLAUTE-----', 'LEICH ', 'LEIK ',
639
        'LEIDERREGE------', 'LEIT ', 'LEIT ',
640
        'LEIDGEPR----^', 'LEIT ', 'LEIT ',
641
        'LEINSTEHE-----', 'LEIN ', 'LEIN ',
642
        'LEL-', 'LE', 'LE',
643
        'LE(MNRST)-3^', 'LE', 'LE',
644
        'LETTE$', 'LET', 'LET',
645
        'LFGNAG-', 'LFGAN', 'LFKAN',
646
        'LICHERWEIS----', 'LICHA ', 'LIKA ',
647
        'LIC$', 'LIZ', 'LIZ',
648
        'LIVE^$', 'LEIF', 'LEIF',
649
        'LT(SßZ)$', 'LS', 'LZ',
650
        'LT\'S$', 'LS', 'LZ',
651
        'LT´S$', 'LS', 'LZ',
652
        'LUI(GS)--', 'LU', 'LU',
653
        'LV(AIO)-', 'LW', None,
654
        'LY9^', 'LÜ', None,
655
        'LSTS$', 'LS', 'LZ',
656
        'LZ(BDFGKLMNPQRSTVWX)-', 'LS', None,
657
        'L(SßZ)$', 'LS', None,
658
        'MAIR-<', 'MEI', 'NEI',
659
        'MANAG-', 'MENE', 'NENE',
660
        'MANUEL', 'MANUEL', None,
661
        'MASSEU(RS)-', 'MASÖ', 'NAZÖ',
662
        'MATCH', 'MESH', 'NEZ',
663
        'MAURICE', 'MORIS', 'NURIZ',
664
        'MBH^$', 'MBH', 'MBH',
665
        'MB(ßZ)$', 'MS', None,
666
        'MB(SßTZ)-', 'M', 'N',
667
        'MCG9^', 'MAK', 'NAK',
668
        'MC9^', 'MAK', 'NAK',
669
        'MEMOIR-^', 'MEMOA', 'NENUA',
670
        'MERHAVEN$', 'MAHAFN', None,
671
        'ME(LMNRST)-3^', 'ME', 'NE',
672
        'MEN(STZ)--3', 'ME', None,
673
        'MEN$', 'MEN', None,
674
        'MIGUEL-', 'MIGE', 'NIKE',
675
        'MIKE^$', 'MEIK', 'NEIK',
676
        'MITHILFE----^$', 'MIT H', 'NIT ',
677
        'MN$', 'M', None,
678
        'MN', 'N', 'N',
679
        'MPJUTE-', 'MPUT', 'NBUT',
680
        'MP(ßZ)$', 'MS', None,
681
        'MP(SßTZ)-', 'M', 'N',
682
        'MP(BDJLMNPQVW)-', 'MB', 'NB',
683
        'MY9^', 'MÜ', None,
684
        'M(ßZ)$', 'MS', None,
685
        'M´G7^', 'MAK', 'NAK',
686
        'M\'G7^', 'MAK', 'NAK',
687
        'M´^', 'MAK', 'NAK',
688
        'M\'^', 'MAK', 'NAK',
689
        'M', None, 'N',
690
        'NACH^^', 'NACH', 'NAK',
691
        'NADINE', 'NADIN', 'NATIN',
692
        'NAIV--', 'NA', 'NA',
693
        'NAISE$', 'NESE', 'NEZE',
694
        'NAUGENOMM------', 'NAU ', 'NAU ',
695
        'NAUSOGUT$', 'NAUSO GUT', 'NAUZU KUT',
696
        'NCH$', 'NSH', 'NZ',
697
        'NCOISE$', 'SOA', 'ZUA',
698
        'NCOIS$', 'SOA', 'ZUA',
699
        'NDAR$', 'NDA', 'NTA',
700
        'NDERINGEN------', 'NDE ', 'NTE ',
701
        'NDRO(CDKTZ)-', 'NTRO', None,
702
        'ND(BFGJLMNPQVW)-', 'NT', None,
703
        'ND(SßZ)$', 'NS', 'NZ',
704
        'ND\'S$', 'NS', 'NZ',
705
        'ND´S$', 'NS', 'NZ',
706
        'NEBEN^^', 'NEBN', 'NEBN',
707
        'NENGELERN------', 'NEN ', 'NEN ',
708
        'NENLERN(ET)---', 'NEN LE', 'NEN LE',
709
        'NENZULERNE---', 'NEN ZU LE', 'NEN ZU LE',
710
        'NE(LMNRST)-3^', 'NE', 'NE',
711
        'NEN-3', 'NE', 'NE',
712
        'NETTE$', 'NET', 'NET',
713
        'NGU^^', 'NU', 'NU',
714
        'NG(BDFJLMNPQRTVW)-', 'NK', 'NK',
715
        'NH(AUO)-$', 'NI', 'NI',
716
        'NICHTSAHNEN-----', 'NIX ', 'NIX ',
717
        'NICHTSSAGE----', 'NIX ', 'NIX ',
718
        'NICHTS^^', 'NIX', 'NIX',
719
        'NICHT^^', 'NICHT', 'NIKT',
720
        'NINE$', 'NIN', 'NIN',
721
        'NON^^', 'NON', 'NUN',
722
        'NOTLEIDE-----^', 'NOT ', 'NUT ',
723
        'NOT^^', 'NOT', 'NUT',
724
        'NTI(AIOU)-3', 'NZI', 'NZI',
725
        'NTIEL--3', 'NZI', 'NZI',
726
        'NT(SßZ)$', 'NS', 'NZ',
727
        'NT\'S$', 'NS', 'NZ',
728
        'NT´S$', 'NS', 'NZ',
729
        'NYLON', 'NEILON', 'NEILUN',
730
        'NY9^', 'NÜ', None,
731
        'NSTZUNEH---', 'NST ZU ', 'NZT ZU ',
732
        'NSZ-', 'NS', None,
733
        'NSTS$', 'NS', 'NZ',
734
        'NZ(BDFGKLMNPQRSTVWX)-', 'NS', None,
735
        'N(SßZ)$', 'NS', None,
736
        'OBERE-', 'OBER', None,
737
        'OBER^^', 'OBA', 'UBA',
738
        'OEU2', 'Ö', 'Ö',
739
        'OE<2', 'Ö', 'Ö',
740
        'OGL-', 'OK', None,
741
        'OGNIE-', 'ONI', 'UNI',
742
        'OGN(AEOU)-$', 'ONI', 'UNI',
743
        'OH(AIOÖUÜY)-', 'OH', None,
744
        'OIE$', 'Ö', 'Ö',
745
        'OIRE$', 'OA', 'UA',
746
        'OIR$', 'OA', 'UA',
747
        'OIX', 'OA', 'UA',
748
        'OI<3', 'EU', 'EU',
749
        'OKAY^$', 'OKE', 'UKE',
750
        'OLYN$', 'OLIN', 'ULIN',
751
        'OO(DLMZ)-', 'U', None,
752
        'OO$', 'U', None,
753
        'OO-', '', '',
754
        'ORGINAL-----', 'ORI', 'URI',
755
        'OTI(AÄOÖUÜ)-', 'OZI', 'UZI',
756
        'OUI^', 'WI', 'FI',
757
        'OUILLE$', 'ULIE', 'ULIE',
758
        'OU(DT)-^', 'AU', 'AU',
759
        'OUSE$', 'AUS', 'AUZ',
760
        'OUT-', 'AU', 'AU',
761
        'OU', 'U', 'U',
762
        'O(FV)$', 'AU', 'AU',  # due to 'OW$' -> 'AU'
763
        'OVER--<', 'OW', None,
764
        'OV(AOU)-', 'OW', None,
765
        'OW$', 'AU', 'AU',
766
        'OWS$', 'OS', 'UZ',
767
        'OJ(AÄEIOÖUÜ)--', 'O', 'U',
768
        'OYER', 'OIA', None,
769
        'OY(AÄEIOÖUÜ)--', 'O', 'U',
770
        'O(JY)<', 'EU', 'EU',
771
        'OZ$', 'OS', None,
772
        'O´^', 'O', 'U',
773
        'O\'^', 'O', 'U',
774
        'O', None, 'U',
775
        'PATIEN--^', 'PAZI', 'PAZI',
776
        'PENSIO-^', 'PANSI', 'PANZI',
777
        'PE(LMNRST)-3^', 'PE', 'PE',
778
        'PFER-^', 'FE', 'FE',
779
        'P(FH)<', 'F', 'F',
780
        'PIC^$', 'PIK', 'PIK',
781
        'PIC$', 'PIZ', 'PIZ',
782
        'PIPELINE', 'PEIBLEIN', 'PEIBLEIN',
783
        'POLYP-', 'POLÜ', None,
784
        'POLY^^', 'POLI', 'PULI',
785
        'PORTRAIT7', 'PORTRE', 'PURTRE',
786
        'POWER7', 'PAUA', 'PAUA',
787
        'PP(FH)--<', 'B', 'B',
788
        'PP-', '', '',
789
        'PRODUZ-^', 'PRODU', 'BRUTU',
790
        'PRODUZI--', ' PRODU', ' BRUTU',
791
        'PRIX^$', 'PRI', 'PRI',
792
        'PS-^^', 'P', None,
793
        'P(SßZ)^', None, 'Z',
794
        'P(SßZ)$', 'BS', None,
795
        'PT-^', '', '',
796
        'PTI(AÄOÖUÜ)-3', 'BZI', 'BZI',
797
        'PY9^', 'PÜ', None,
798
        'P(AÄEIOÖRUÜY)-', 'P', 'P',
799
        'P(ÀÁÂÃÅÈÉÊÌÍÎÙÚÛ)-', 'P', None,
800
        'P.^', None, 'P.',
801
        'P^', 'P', None,
802
        'P', 'B', 'B',
803
        'QI-', 'Z', 'Z',
804
        'QUARANT--', 'KARA', 'KARA',
805
        'QUE(LMNRST)-3', 'KWE', 'KFE',
806
        'QUE$', 'K', 'K',
807
        'QUI(NS)$', 'KI', 'KI',
808
        'QUIZ7', 'KWIS', None,
809
        'Q(UV)7', 'KW', 'KF',
810
        'Q<', 'K', 'K',
811
        'RADFAHR----', 'RAT ', 'RAT ',
812
        'RAEFTEZEHRE-----', 'REFTE ', 'REFTE ',
813
        'RCH', 'RCH', 'RK',
814
        'REA(DU)---3^', 'R', None,
815
        'REBSERZEUG------', 'REBS ', 'REBZ ',
816
        'RECHERCH^', 'RESHASH', 'REZAZ',
817
        'RECYCL--', 'RIZEI', 'RIZEI',
818
        'RE(ALST)-3^', 'RE', None,
819
        'REE$', 'RI', 'RI',
820
        'RER$', 'RA', 'RA',
821
        'RE(MNR)-4', 'RE', 'RE',
822
        'RETTE$', 'RET', 'RET',
823
        'REUZ$', 'REUZ', None,
824
        'REW$', 'RU', 'RU',
825
        'RH<^', 'R', 'R',
826
        'RJA(MN)--', 'RI', 'RI',
827
        'ROWD-^', 'RAU', 'RAU',
828
        'RTEMONNAIE-', 'RTMON', 'RTNUN',
829
        'RTI(AÄOÖUÜ)-3', 'RZI', 'RZI',
830
        'RTIEL--3', 'RZI', 'RZI',
831
        'RV(AEOU)-3', 'RW', None,
832
        'RY(KN)-$', 'RI', 'RI',
833
        'RY9^', 'RÜ', None,
834
        'RÄFTEZEHRE-----', 'REFTE ', 'REFTE ',
835
        'SAISO-^', 'SES', 'ZEZ',
836
        'SAFE^$', 'SEIF', 'ZEIF',
837
        'SAUCE-^', 'SOS', 'ZUZ',
838
        'SCHLAGGEBEN-----<', 'SHLAK ', 'ZLAK ',
839
        'SCHSCH---7', '', '',
840
        'SCHTSCH', 'SH', 'Z',
841
        'SC(HZ)<', 'SH', 'Z',
842
        'SC', 'SK', 'ZK',
843
        'SELBSTST--7^^', 'SELB', 'ZELB',
844
        'SELBST7^^', 'SELBST', 'ZELBZT',
845
        'SERVICE7^', 'SÖRWIS', 'ZÖRFIZ',
846
        'SERVI-^', 'SERW', None,
847
        'SE(LMNRST)-3^', 'SE', 'ZE',
848
        'SETTE$', 'SET', 'ZET',
849
        'SHP-^', 'S', 'Z',
850
        'SHST', 'SHT', 'ZT',
851
        'SHTSH', 'SH', 'Z',
852
        'SHT', 'ST', 'Z',
853
        'SHY9^', 'SHÜ', None,
854
        'SH^^', 'SH', None,
855
        'SH3', 'SH', 'Z',
856
        'SICHERGEGAN-----^', 'SICHA ', 'ZIKA ',
857
        'SICHERGEHE----^', 'SICHA ', 'ZIKA ',
858
        'SICHERGESTEL------^', 'SICHA ', 'ZIKA ',
859
        'SICHERSTELL-----^', 'SICHA ', 'ZIKA ',
860
        'SICHERZU(GS)--^', 'SICHA ZU ', 'ZIKA ZU ',
861
        'SIEGLI-^', 'SIKL', 'ZIKL',
862
        'SIGLI-^', 'SIKL', 'ZIKL',
863
        'SIGHT', 'SEIT', 'ZEIT',
864
        'SIGN', 'SEIN', 'ZEIN',
865
        'SKI(NPZ)-', 'SKI', 'ZKI',
866
        'SKI<^', 'SHI', 'ZI',
867
        'SODASS^$', 'SO DAS', 'ZU TAZ',
868
        'SODAß^$', 'SO DAS', 'ZU TAZ',
869
        'SOGENAN--^', 'SO GEN', 'ZU KEN',
870
        'SOUND-', 'SAUN', 'ZAUN',
871
        'STAATS^^', 'STAZ', 'ZTAZ',
872
        'STADT^^', 'STAT', 'ZTAT',
873
        'STANDE$', ' STANDE', ' ZTANTE',
874
        'START^^', 'START', 'ZTART',
875
        'STAURANT7', 'STORAN', 'ZTURAN',
876
        'STEAK-', 'STE', 'ZTE',
877
        'STEPHEN-^$', 'STEW', None,
878
        'STERN', 'STERN', None,
879
        'STRAF^^', 'STRAF', 'ZTRAF',
880
        'ST\'S$', 'Z', 'Z',
881
        'ST´S$', 'Z', 'Z',
882
        'STST--', '', '',
883
        'STS(ACEÈÉÊHIÌÍÎOUÄÜÖ)--', 'ST', 'ZT',
884
        'ST(SZ)', 'Z', 'Z',
885
        'SPAREN---^', 'SPA', 'ZPA',
886
        'SPAREND----', ' SPA', ' ZPA',
887
        'S(PTW)-^^', 'S', None,
888
        'SP', 'SP', None,
889
        'STYN(AE)-$', 'STIN', 'ZTIN',
890
        'ST', 'ST', 'ZT',
891
        'SUITE<', 'SIUT', 'ZIUT',
892
        'SUKE--$', 'S', 'Z',
893
        'SURF(EI)-', 'SÖRF', 'ZÖRF',
894
        'SV(AEÈÉÊIÌÍÎOU)-<^', 'SW', None,
895
        'SYB(IY)--^', 'SIB', None,
896
        'SYL(KVW)--^', 'SI', None,
897
        'SY9^', 'SÜ', None,
898
        'SZE(NPT)-^', 'ZE', 'ZE',
899
        'SZI(ELN)-^', 'ZI', 'ZI',
900
        'SZCZ<', 'SH', 'Z',
901
        'SZT<', 'ST', 'ZT',
902
        'SZ<3', 'SH', 'Z',
903
        'SÜL(KVW)--^', 'SI', None,
904
        'S', None, 'Z',
905
        'TCH', 'SH', 'Z',
906
        'TD(AÄEIOÖRUÜY)-', 'T', None,
907
        'TD(ÀÁÂÃÅÈÉÊËÌÍÎÏÒÓÔÕØÙÚÛÝŸ)-', 'T', None,
908
        'TEAT-^', 'TEA', 'TEA',
909
        'TERRAI7^', 'TERA', 'TERA',
910
        'TE(LMNRST)-3^', 'TE', 'TE',
911
        'TH<', 'T', 'T',
912
        'TICHT-', 'TIK', 'TIK',
913
        'TICH$', 'TIK', 'TIK',
914
        'TIC$', 'TIZ', 'TIZ',
915
        'TIGGESTELL-------', 'TIK ', 'TIK ',
916
        'TIGSTELL-----', 'TIK ', 'TIK ',
917
        'TOAS-^', 'TO', 'TU',
918
        'TOILET-', 'TOLE', 'TULE',
919
        'TOIN-', 'TOA', 'TUA',
920
        'TRAECHTI-^', 'TRECHT', 'TREKT',
921
        'TRAECHTIG--', ' TRECHT', ' TREKT',
922
        'TRAINI-', 'TREN', 'TREN',
923
        'TRÄCHTI-^', 'TRECHT', 'TREKT',
924
        'TRÄCHTIG--', ' TRECHT', ' TREKT',
925
        'TSCH', 'SH', 'Z',
926
        'TSH', 'SH', 'Z',
927
        'TST', 'ZT', 'ZT',
928
        'T(Sß)', 'Z', 'Z',
929
        'TT(SZ)--<', '', '',
930
        'TT9', 'T', 'T',
931
        'TV^$', 'TV', 'TV',
932
        'TX(AEIOU)-3', 'SH', 'Z',
933
        'TY9^', 'TÜ', None,
934
        'TZ-', '', '',
935
        'T\'S3$', 'Z', 'Z',
936
        'T´S3$', 'Z', 'Z',
937
        'UEBEL(GNRW)-^^', 'ÜBL ', 'IBL ',
938
        'UEBER^^', 'ÜBA', 'IBA',
939
        'UE2', 'Ü', 'I',
940
        'UGL-', 'UK', None,
941
        'UH(AOÖUÜY)-', 'UH', None,
942
        'UIE$', 'Ü', 'I',
943
        'UM^^', 'UM', 'UN',
944
        'UNTERE--3', 'UNTE', 'UNTE',
945
        'UNTER^^', 'UNTA', 'UNTA',
946
        'UNVER^^', 'UNFA', 'UNFA',
947
        'UN^^', 'UN', 'UN',
948
        'UTI(AÄOÖUÜ)-', 'UZI', 'UZI',
949
        'UVE-4', 'UW', None,
950
        'UY2', 'UI', None,
951
        'UZZ', 'AS', 'AZ',
952
        'VACL-^', 'WAZ', 'FAZ',
953
        'VAC$', 'WAZ', 'FAZ',
954
        'VAN DEN ^', 'FANDN', 'FANTN',
955
        'VANES-^', 'WANE', None,
956
        'VATRO-', 'WATR', None,
957
        'VA(DHJNT)--^', 'F', None,
958
        'VEDD-^', 'FE', 'FE',
959
        'VE(BEHIU)--^', 'F', None,
960
        'VEL(BDLMNT)-^', 'FEL', None,
961
        'VENTZ-^', 'FEN', None,
962
        'VEN(NRSZ)-^', 'FEN', None,
963
        'VER(AB)-^$', 'WER', None,
964
        'VERBAL^$', 'WERBAL', None,
965
        'VERBAL(EINS)-^', 'WERBAL', None,
966
        'VERTEBR--', 'WERTE', None,
967
        'VEREIN-----', 'F', None,
968
        'VEREN(AEIOU)-^', 'WEREN', None,
969
        'VERIFI', 'WERIFI', None,
970
        'VERON(AEIOU)-^', 'WERON', None,
971
        'VERSEN^', 'FERSN', 'FAZN',
972
        'VERSIERT--^', 'WERSI', None,
973
        'VERSIO--^', 'WERS', None,
974
        'VERSUS', 'WERSUS', None,
975
        'VERTI(GK)-', 'WERTI', None,
976
        'VER^^', 'FER', 'FA',
977
        'VERSPRECHE-------', ' FER', ' FA',
978
        'VER$', 'WA', None,
979
        'VER', 'FA', 'FA',
980
        'VET(HT)-^', 'FET', 'FET',
981
        'VETTE$', 'WET', 'FET',
982
        'VE^', 'WE', None,
983
        'VIC$', 'WIZ', 'FIZ',
984
        'VIELSAGE----', 'FIL ', 'FIL ',
985
        'VIEL', 'FIL', 'FIL',
986
        'VIEW', 'WIU', 'FIU',
987
        'VILL(AE)-', 'WIL', None,
988
        'VIS(ACEIKUVWZ)-<^', 'WIS', None,
989
        'VI(ELS)--^', 'F', None,
990
        'VILLON--', 'WILI', 'FILI',
991
        'VIZE^^', 'FIZE', 'FIZE',
992
        'VLIE--^', 'FL', None,
993
        'VL(AEIOU)--', 'W', None,
994
        'VOKA-^', 'WOK', None,
995
        'VOL(ATUVW)--^', 'WO', None,
996
        'VOR^^', 'FOR', 'FUR',
997
        'VR(AEIOU)--', 'W', None,
998
        'VV9', 'W', None,
999
        'VY9^', 'WÜ', 'FI',
1000
        'V(ÜY)-', 'W', None,
1001
        'V(ÀÁÂÃÅÈÉÊÌÍÎÙÚÛ)-', 'W', None,
1002
        'V(AEIJLRU)-<', 'W', None,
1003
        'V.^', 'V.', None,
1004
        'V<', 'F', 'F',
1005
        'WEITERENTWI-----^', 'WEITA ', 'FEITA ',
1006
        'WEITREICH-----^', 'WEIT ', 'FEIT ',
1007
        'WEITVER^', 'WEIT FER', 'FEIT FA',
1008
        'WE(LMNRST)-3^', 'WE', 'FE',
1009
        'WER(DST)-', 'WER', None,
1010
        'WIC$', 'WIZ', 'FIZ',
1011
        'WIEDERU--', 'WIDE', 'FITE',
1012
        'WIEDER^$', 'WIDA', 'FITA',
1013
        'WIEDER^^', 'WIDA ', 'FITA ',
1014
        'WIEVIEL', 'WI FIL', 'FI FIL',
1015
        'WISUEL', 'WISUEL', None,
1016
        'WR-^', 'W', None,
1017
        'WY9^', 'WÜ', 'FI',
1018
        'W(BDFGJKLMNPQRSTZ)-', 'F', None,
1019
        'W$', 'F', None,
1020
        'W', None, 'F',
1021
        'X<^', 'Z', 'Z',
1022
        'XHAVEN$', 'XAFN', None,
1023
        'X(CSZ)', 'X', 'X',
1024
        'XTS(CH)--', 'XT', 'XT',
1025
        'XT(SZ)', 'Z', 'Z',
1026
        'YE(LMNRST)-3^', 'IE', 'IE',
1027
        'YE-3', 'I', 'I',
1028
        'YOR(GK)^$', 'IÖRK', 'IÖRK',
1029
        'Y(AOU)-<7', 'I', 'I',
1030
        'Y(BKLMNPRSTX)-1', 'Ü', None,
1031
        'YVES^$', 'IF', 'IF',
1032
        'YVONNE^$', 'IWON', 'IFUN',
1033
        'Y.^', 'Y.', None,
1034
        'Y', 'I', 'I',
1035
        'ZC(AOU)-', 'SK', 'ZK',
1036
        'ZE(LMNRST)-3^', 'ZE', 'ZE',
1037
        'ZIEJ$', 'ZI', 'ZI',
1038
        'ZIGERJA(HR)-3', 'ZIGA IA', 'ZIKA IA',
1039
        'ZL(AEIOU)-', 'SL', None,
1040
        'ZS(CHT)--', '', '',
1041
        'ZS', 'SH', 'Z',
1042
        'ZUERST', 'ZUERST', 'ZUERST',
1043
        'ZUGRUNDE^$', 'ZU GRUNDE', 'ZU KRUNTE',
1044
        'ZUGRUNDE', 'ZU GRUNDE ', 'ZU KRUNTE ',
1045
        'ZUGUNSTEN', 'ZU GUNSTN', 'ZU KUNZTN',
1046
        'ZUHAUSE-', 'ZU HAUS', 'ZU AUZ',
1047
        'ZULASTEN^$', 'ZU LASTN', 'ZU LAZTN',
1048
        'ZURUECK^^', 'ZURÜK', 'ZURIK',
1049
        'ZURZEIT', 'ZUR ZEIT', 'ZUR ZEIT',
1050
        'ZURÜCK^^', 'ZURÜK', 'ZURIK',
1051
        'ZUSTANDE', 'ZU STANDE', 'ZU ZTANTE',
1052
        'ZUTAGE', 'ZU TAGE', 'ZU TAKE',
1053
        'ZUVER^^', 'ZUFA', 'ZUFA',
1054
        'ZUVIEL', 'ZU FIL', 'ZU FIL',
1055
        'ZUWENIG', 'ZU WENIK', 'ZU FENIK',
1056
        'ZY9^', 'ZÜ', None,
1057
        'ZYK3$', 'ZIK', None,
1058
        'Z(VW)7^', 'SW', None,
1059
        None, None, None
1060
        # fmt: on
1061
    )
1062
1063 1
    # Translation table for ``str.translate``: maps the code point of each
    # lower-case letter (ASCII plus a set of accented/special Latin
    # letters) to its upper-case counterpart.  'ß' maps to itself.
    _upper_trans = {
        ord(lower): upper
        for lower, upper in zip(
            'abcdefghijklmnopqrstuvwxyzàáâãåäæ'
            + 'çðèéêëìíîïñòóôõöøœšßþùúûüýÿ',
            'ABCDEFGHIJKLMNOPQRSTUVWXYZÀÁÂÃÅÄÆ'
            + 'ÇÐÈÉÊËÌÍÎÏÑÒÓÔÕÖØŒŠßÞÙÚÛÜÝŸ',
        )
    }
1074
1075 1
    def encode(self, word, mode=1, lang='de'):
0 ignored issues
show
Bug introduced by
Parameters differ from overridden 'encode' method
Loading history...
1076
        """Return the phonet code for a word.
1077
1078
        Parameters
1079
        ----------
1080
        word : str
1081
            The word to transform
1082
        mode : int
1083
            The phonet variant to employ (1 or 2)
1084
        lang : str
1085
            ``de`` (default) for German, ``none`` for no language
1086
1087
        Returns
1088
        -------
1089
        str
1090
            The phonet value
1091
1092
        Examples
1093
        --------
1094
        >>> pe = Phonet()
1095
        >>> pe.encode('Christopher')
1096
        'KRISTOFA'
1097
        >>> pe.encode('Niall')
1098
        'NIAL'
1099
        >>> pe.encode('Smith')
1100
        'SMIT'
1101
        >>> pe.encode('Schmidt')
1102
        'SHMIT'
1103
1104
        >>> pe.encode('Christopher', mode=2)
1105
        'KRIZTUFA'
1106
        >>> pe.encode('Niall', mode=2)
1107
        'NIAL'
1108
        >>> pe.encode('Smith', mode=2)
1109
        'ZNIT'
1110
        >>> pe.encode('Schmidt', mode=2)
1111
        'ZNIT'
1112
1113
        >>> pe.encode('Christopher', lang='none')
1114
        'CHRISTOPHER'
1115
        >>> pe.encode('Niall', lang='none')
1116
        'NIAL'
1117
        >>> pe.encode('Smith', lang='none')
1118
        'SMITH'
1119
        >>> pe.encode('Schmidt', lang='none')
1120
        'SCHMIDT'
1121
1122
        """
1123 1
        phonet_hash = Counter()
1124 1
        alpha_pos = Counter()
1125
1126 1
        phonet_hash_1 = Counter()
1127 1
        phonet_hash_2 = Counter()
1128
1129 1
        def _initialize_phonet(lang):
            """Fill the phonet lookup tables for the selected rule set.

            The rule table is a flat sequence of triples: a search pattern
            followed by its two replacement entries. This function records,
            in the enclosing scope's ``alpha_pos``, ``phonet_hash``,
            ``phonet_hash_1`` and ``phonet_hash_2`` counters, where the rules
            for each leading character (and each leading letter/following
            letter pair) are located in that table.

            Parameters
            ----------
            lang : str
                Language to use for rules (``none`` selects the
                language-independent rules; anything else selects German)

            """
            rules = (
                self._rules_no_lang if lang == 'none' else self._rules_german
            )

            phonet_hash[''] = -1

            # German and international umlauts: all share position 1
            for letter in 'ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖØÙÚÛÜÝÞßŒŠŸ':
                alpha_pos[letter] = 1
                phonet_hash[letter] = -1

            # "normal" letters ('A'-'Z'): positions 2 through 27
            for position, letter in enumerate('ABCDEFGHIJKLMNOPQRSTUVWXYZ', 2):
                alpha_pos[letter] = position
                phonet_hash[letter] = -1

            # mark every (letter, following position) slot as "no rule"
            for row in range(26):
                for col in range(28):
                    phonet_hash_1[row, col] = -1
                    phonet_hash_2[row, col] = -1

            # visit the search pattern of each rule triple
            for idx in range(0, len(rules), 3):
                pattern = rules[idx]
                if not pattern:
                    continue

                # first hash value: index of the first rule for this leading
                # character that has at least one non-empty replacement
                lead = pattern[0]
                if phonet_hash[lead] < 0 and (
                    rules[idx + 1] or rules[idx + 2]
                ):
                    phonet_hash[lead] = idx

                # second hash values exist only for the letters 'A'-'Z'
                if alpha_pos[lead] < 2:
                    continue

                row = alpha_pos[lead] - 2
                tail = pattern[1:]

                # characters that may follow the leading letter: nothing
                # (' '), a parenthesised set, or one literal character
                if not tail:
                    followers = ' '
                elif tail[0] == '(':
                    followers = tail[1:]
                else:
                    followers = tail[0]

                for follower in followers:
                    if follower == ')':
                        break

                    col = alpha_pos[follower]

                    if col > 0:
                        # add hash value for this letter
                        if phonet_hash_1[row, col] < 0:
                            phonet_hash_1[row, col] = idx
                            phonet_hash_2[row, col] = idx

                        if phonet_hash_2[row, col] >= (idx - 30):
                            phonet_hash_2[row, col] = idx
                        else:
                            # last rule for this pair lies too far back;
                            # register under the "all letters" slot instead
                            col = -1

                    if col <= 0:
                        # add hash value for all letters
                        if phonet_hash_1[row, 0] < 0:
                            phonet_hash_1[row, 0] = idx

                        phonet_hash_2[row, 0] = idx
1244
1245 1
        def _phonet(term, mode, lang):
0 ignored issues
show
Comprehensibility introduced by
This function exceeds the maximum number of variables (29/15).
Loading history...
1246
            """Return the phonet coded form of a term.
1247
1248
            Parameters
1249
            ----------
1250
            term : str
1251
                Term to transform
1252
            mode : int
1253
                The phonet variant to employ (1 or 2)
1254
            lang : str
1255
                ``de`` (default) for German, ``none`` for no language
1256
1257
            Returns
1258
            -------
1259
            str
1260
                The phonet value
1261
1262
            """
1263 1
            if lang == 'none':
1264 1
                _phonet_rules = self._rules_no_lang
1265
            else:
1266 1
                _phonet_rules = self._rules_german
1267
1268 1
            char0 = ''
1269 1
            dest = term
1270
1271 1
            if not term:
1272 1
                return ''
1273
1274 1
            term_length = len(term)
1275
1276
            # convert input string to upper-case
1277 1
            src = term.translate(self._upper_trans)
1278
1279
            # check "src"
1280 1
            i = 0
1281 1
            j = 0
1282 1
            zeta = 0
1283
1284 1
            while i < len(src):
0 ignored issues
show
unused-code introduced by
Too many nested blocks (6/5)
Loading history...
unused-code introduced by
Too many nested blocks (8/5)
Loading history...
unused-code introduced by
Too many nested blocks (9/5)
Loading history...
unused-code introduced by
Too many nested blocks (7/5)
Loading history...
1285 1
                char = src[i]
1286
1287 1
                pos = alpha_pos[char]
1288
1289 1
                if pos >= 2:
1290 1
                    xpos = pos - 2
1291
1292 1
                    if i + 1 == len(src):
1293 1
                        pos = alpha_pos['']
1294
                    else:
1295 1
                        pos = alpha_pos[src[i + 1]]
1296
1297 1
                    start1 = phonet_hash_1[xpos, pos]
1298 1
                    start2 = phonet_hash_1[xpos, 0]
1299 1
                    end1 = phonet_hash_2[xpos, pos]
1300 1
                    end2 = phonet_hash_2[xpos, 0]
1301
1302
                    # preserve rule priorities
1303 1
                    if (start2 >= 0) and ((start1 < 0) or (start2 < start1)):
1304 1
                        pos = start1
1305 1
                        start1 = start2
1306 1
                        start2 = pos
1307 1
                        pos = end1
1308 1
                        end1 = end2
1309 1
                        end2 = pos
1310
1311 1
                    if (end1 >= start2) and (start2 >= 0):
1312 1
                        if end2 > end1:
1313 1
                            end1 = end2
1314
1315 1
                        start2 = -1
1316 1
                        end2 = -1
1317
                else:
1318 1
                    pos = phonet_hash[char]
1319 1
                    start1 = pos
1320 1
                    end1 = 10000
1321 1
                    start2 = -1
1322 1
                    end2 = -1
1323
1324 1
                pos = start1
1325 1
                zeta0 = 0
1326
1327 1
                if pos >= 0:
1328
                    # check rules for this char
1329 1
                    while (_phonet_rules[pos] is None) or (
1330
                        _phonet_rules[pos][0] == char
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
1331
                    ):
1332 1
                        if pos > end1:
1333 1
                            if start2 > 0:
1334 1
                                pos = start2
1335 1
                                start1 = start2
1336 1
                                start2 = -1
1337 1
                                end1 = end2
1338 1
                                end2 = -1
1339 1
                                continue
1340
1341 1
                            break
1342
1343 1
                        if (_phonet_rules[pos] is None) or (
1344
                            _phonet_rules[pos + mode] is None
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
1345
                        ):
1346
                            # no conversion rule available
1347 1
                            pos += 3
1348 1
                            continue
1349
1350
                        # check whole string
1351 1
                        matches = 1  # number of matching letters
1352 1
                        priority = 5  # default priority
1353 1
                        rule = _phonet_rules[pos]
1354 1
                        rule = rule[1:]
1355
1356 1
                        while (
1357
                            rule
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
1358
                            and (len(src) > (i + matches))
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
1359
                            and (src[i + matches] == rule[0])
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
1360
                            and not rule[0].isdigit()
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
1361
                            and (rule not in '(-<^$')
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
1362
                        ):
1363 1
                            matches += 1
1364 1
                            rule = rule[1:]
1365
1366 1
                        if rule and (rule[0] == '('):
1367
                            # check an array of letters
1368 1
                            if (
1369
                                (len(src) > (i + matches))
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
1370
                                and src[i + matches].isalpha()
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
1371
                                and (src[i + matches] in rule[1:])
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
1372
                            ):
1373 1
                                matches += 1
1374
1375 1
                                while rule and rule[0] != ')':
1376 1
                                    rule = rule[1:]
1377
1378
                                # if rule[0] == ')':
1379 1
                                rule = rule[1:]
1380
1381 1
                        if rule:
1382 1
                            priority0 = ord(rule[0])
1383
                        else:
1384 1
                            priority0 = 0
1385
1386 1
                        matches0 = matches
1387
1388 1
                        while rule and rule[0] == '-' and matches > 1:
1389 1
                            matches -= 1
1390 1
                            rule = rule[1:]
1391
1392 1
                        if rule and rule[0] == '<':
1393 1
                            rule = rule[1:]
1394
1395 1
                        if rule and rule[0].isdigit():
1396
                            # read priority
1397 1
                            priority = int(rule[0])
1398 1
                            rule = rule[1:]
1399
1400 1
                        if rule and rule[0:2] == '^^':
1401 1
                            rule = rule[1:]
1402
1403 1
                        if (
1404
                            not rule
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
best-practice introduced by
Too many boolean expressions in if statement (12/5)
Loading history...
1405
                            or (
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
1406
                                (rule[0] == '^')
1407
                                and ((i == 0) or not src[i - 1].isalpha())
1408
                                and (
1409
                                    (rule[1:2] != '$')
1410
                                    or (
1411
                                        not (
1412
                                            src[
1413
                                                i + matches0 : i + matches0 + 1
1414
                                            ].isalpha()
1415
                                        )
1416
                                        and (
1417
                                            src[
1418
                                                i + matches0 : i + matches0 + 1
1419
                                            ]
1420
                                            != '.'
1421
                                        )
1422
                                    )
1423
                                )
1424
                            )
1425
                            or (
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
1426
                                (rule[0] == '$')
1427
                                and (i > 0)
1428
                                and src[i - 1].isalpha()
1429
                                and (
1430
                                    (
1431
                                        not src[
1432
                                            i + matches0 : i + matches0 + 1
1433
                                        ].isalpha()
1434
                                    )
1435
                                    and (
1436
                                        src[i + matches0 : i + matches0 + 1]
1437
                                        != '.'
1438
                                    )
1439
                                )
1440
                            )
1441
                        ):
1442
                            # look for continuation, if:
1443
                            # matches > 1 und NO '-' in first string */
1444 1
                            pos0 = -1
1445
1446 1
                            start3 = 0
1447 1
                            start4 = 0
1448 1
                            end3 = 0
1449 1
                            end4 = 0
1450
1451 1
                            if (
1452
                                (matches > 1)
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
1453
                                and src[i + matches : i + matches + 1]
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
1454
                                and (priority0 != ord('-'))
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
1455
                            ):
1456 1
                                char0 = src[i + matches - 1]
1457 1
                                pos0 = alpha_pos[char0]
1458
1459 1
                                if pos0 >= 2 and src[i + matches]:
1460 1
                                    xpos = pos0 - 2
1461 1
                                    pos0 = alpha_pos[src[i + matches]]
1462 1
                                    start3 = phonet_hash_1[xpos, pos0]
1463 1
                                    start4 = phonet_hash_1[xpos, 0]
1464 1
                                    end3 = phonet_hash_2[xpos, pos0]
1465 1
                                    end4 = phonet_hash_2[xpos, 0]
1466
1467
                                    # preserve rule priorities
1468 1
                                    if (start4 >= 0) and (
1469
                                        (start3 < 0) or (start4 < start3)
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
1470
                                    ):
1471 1
                                        pos0 = start3
1472 1
                                        start3 = start4
1473 1
                                        start4 = pos0
1474 1
                                        pos0 = end3
1475 1
                                        end3 = end4
1476 1
                                        end4 = pos0
1477
1478 1
                                    if (end3 >= start4) and (start4 >= 0):
1479 1
                                        if end4 > end3:
1480 1
                                            end3 = end4
1481
1482 1
                                        start4 = -1
1483 1
                                        end4 = -1
1484
                                else:
1485 1
                                    pos0 = phonet_hash[char0]
1486 1
                                    start3 = pos0
1487 1
                                    end3 = 10000
1488 1
                                    start4 = -1
1489 1
                                    end4 = -1
1490
1491 1
                                pos0 = start3
1492
1493
                            # check continuation rules for src[i+matches]
1494 1
                            if pos0 >= 0:
1495 1
                                while (_phonet_rules[pos0] is None) or (
1496
                                    _phonet_rules[pos0][0] == char0
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
introduced by
Value '_phonet_rules[pos0]' is unsubscriptable
Loading history...
1497
                                ):
1498 1
                                    if pos0 > end3:
1499 1
                                        if start4 > 0:
1500 1
                                            pos0 = start4
1501 1
                                            start3 = start4
1502 1
                                            start4 = -1
1503 1
                                            end3 = end4
1504 1
                                            end4 = -1
1505 1
                                            continue
1506
1507 1
                                        priority0 = -1
1508
1509
                                        # important
1510 1
                                        break
1511
1512 1
                                    if (_phonet_rules[pos0] is None) or (
1513
                                        _phonet_rules[pos0 + mode] is None
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
1514
                                    ):
1515
                                        # no conversion rule available
1516 1
                                        pos0 += 3
1517 1
                                        continue
1518
1519
                                    # check whole string
1520 1
                                    matches0 = matches
1521 1
                                    priority0 = 5
1522 1
                                    rule = _phonet_rules[pos0]
1523 1
                                    rule = rule[1:]
0 ignored issues
show
introduced by
Value 'rule' is unsubscriptable
Loading history...
1524
1525 1
                                    while (
1526
                                        rule
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
1527
                                        and (
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
1528
                                            src[
1529
                                                i + matches0 : i + matches0 + 1
1530
                                            ]
1531
                                            == rule[0]
1532
                                        )
1533
                                        and (
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
1534
                                            not rule[0].isdigit()
1535
                                            or (rule in '(-<^$')
1536
                                        )
1537
                                    ):
1538 1
                                        matches0 += 1
1539 1
                                        rule = rule[1:]
1540
1541 1
                                    if rule and rule[0] == '(':
1542
                                        # check an array of letters
1543 1
                                        if src[
1544
                                            i + matches0 : i + matches0 + 1
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
1545
                                        ].isalpha() and (
1546
                                            src[i + matches0] in rule[1:]
1547
                                        ):
1548 1
                                            matches0 += 1
1549
1550 1
                                            while rule and rule[0] != ')':
1551 1
                                                rule = rule[1:]
1552
1553
                                            # if rule[0] == ')':
1554 1
                                            rule = rule[1:]
1555
1556 1
                                    while rule and rule[0] == '-':
1557
                                        # "matches0" is NOT decremented
1558
                                        # because of
1559
                                        #    "if (matches0 == matches)"
1560 1
                                        rule = rule[1:]
1561
1562 1
                                    if rule and rule[0] == '<':
1563 1
                                        rule = rule[1:]
1564
1565 1
                                    if rule and rule[0].isdigit():
1566 1
                                        priority0 = int(rule[0])
1567 1
                                        rule = rule[1:]
1568
1569 1
                                    if (
1570
                                        not rule
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
1571
                                        or
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
1572
                                        # rule == '^' is not possible here
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
1573
                                        (
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
1574
                                            (rule[0] == '$')
1575
                                            and not src[
1576
                                                i + matches0 : i + matches0 + 1
1577
                                            ].isalpha()
1578
                                            and (
1579
                                                src[
1580
                                                    i
1581
                                                    + matches0 : i
1582
                                                    + matches0
1583
                                                    + 1
1584
                                                ]
1585
                                                != '.'
1586
                                            )
1587
                                        )
1588
                                    ):
1589 1
                                        if matches0 == matches:
1590
                                            # this is only a partial string
1591 1
                                            pos0 += 3
1592 1
                                            continue
1593
1594 1
                                        if priority0 < priority:
1595
                                            # priority is too low
1596 1
                                            pos0 += 3
1597 1
                                            continue
1598
1599
                                        # continuation rule found
1600 1
                                        break
1601
1602 1
                                    pos0 += 3
1603
1604
                                # end of "while"
1605 1
                                if (priority0 >= priority) and (
1606
                                    (_phonet_rules[pos0] is not None)
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
1607
                                    and (_phonet_rules[pos0][0] == char0)
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
introduced by
Value '_phonet_rules[pos0]' is unsubscriptable
Loading history...
1608
                                ):
1609
1610 1
                                    pos += 3
1611 1
                                    continue
1612
1613
                            # replace string
1614 1
                            if _phonet_rules[pos] and (
1615
                                '<' in _phonet_rules[pos][1:]
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
1616
                            ):
1617 1
                                priority0 = 1
1618
                            else:
1619 1
                                priority0 = 0
1620
1621 1
                            rule = _phonet_rules[pos + mode]
1622
1623 1
                            if (priority0 == 1) and (zeta == 0):
1624
                                # rule with '<' is applied
1625 1
                                if (
1626
                                    (j > 0)
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
1627
                                    and rule
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
1628
                                    and (
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
1629
                                        (dest[j - 1] == char)
1630
                                        or (dest[j - 1] == rule[0])
1631
                                    )
1632
                                ):
1633 1
                                    j -= 1
1634
1635 1
                                zeta0 = 1
1636 1
                                zeta += 1
1637 1
                                matches0 = 0
1638
1639 1
                                while rule and src[i + matches0]:
1640 1
                                    src = (
1641
                                        src[0 : i + matches0]
1642
                                        + rule[0]
1643
                                        + src[i + matches0 + 1 :]
1644
                                    )
1645 1
                                    matches0 += 1
1646 1
                                    rule = rule[1:]
1647
1648 1
                                if matches0 < matches:
1649 1
                                    src = (
1650
                                        src[0 : i + matches0]
1651
                                        + src[i + matches :]
1652
                                    )
1653
1654 1
                                char = src[i]
1655
                            else:
1656 1
                                i = i + matches - 1
1657 1
                                zeta = 0
1658
1659 1
                                while len(rule) > 1:
1660 1
                                    if (j == 0) or (dest[j - 1] != rule[0]):
1661 1
                                        dest = (
1662
                                            dest[0:j]
1663
                                            + rule[0]
1664
                                            + dest[min(len(dest), j + 1) :]
1665
                                        )
1666 1
                                        j += 1
1667
1668 1
                                    rule = rule[1:]
1669
1670
                                # new "current char"
1671 1
                                if not rule:
1672 1
                                    rule = ''
1673 1
                                    char = ''
1674
                                else:
1675 1
                                    char = rule[0]
1676
1677 1
                                if (
1678
                                    _phonet_rules[pos]
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
1679
                                    and '^^' in _phonet_rules[pos][1:]
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
1680
                                ):
1681 1
                                    if char:
1682 1
                                        dest = (
1683
                                            dest[0:j]
1684
                                            + char
1685
                                            + dest[min(len(dest), j + 1) :]
1686
                                        )
1687 1
                                        j += 1
1688
1689 1
                                    src = src[i + 1 :]
1690 1
                                    i = 0
1691 1
                                    zeta0 = 1
1692
1693 1
                            break
1694
1695 1
                        pos += 3
1696
1697 1
                        if pos > end1 and start2 > 0:
1698 1
                            pos = start2
1699 1
                            start1 = start2
1700 1
                            end1 = end2
1701 1
                            start2 = -1
1702 1
                            end2 = -1
1703
1704 1
                if zeta0 == 0:
1705 1
                    if char and ((j == 0) or (dest[j - 1] != char)):
1706
                        # delete multiple letters only
1707 1
                        dest = (
1708
                            dest[0:j] + char + dest[min(j + 1, term_length) :]
1709
                        )
1710 1
                        j += 1
1711
1712 1
                    i += 1
1713 1
                    zeta = 0
1714
1715 1
            dest = dest[0:j]
1716
1717 1
            return dest
1718
1719 1
        _initialize_phonet(lang)
1720
1721 1
        word = unicode_normalize('NFKC', text_type(word))
1722 1
        return _phonet(word, mode, lang)
1723
1724
1725 1
def phonet(word, mode=1, lang='de'):
    """Encode a word with the phonet algorithm.

    Convenience function that creates a :py:class:`Phonet` instance and
    delegates to :py:meth:`Phonet.encode`.

    Parameters
    ----------
    word : str
        The word to transform
    mode : int
        The phonet variant to employ (1 or 2)
    lang : str
        ``de`` (default) for German, ``none`` for no language

    Returns
    -------
    str
        The phonet value

    Examples
    --------
    >>> phonet('Christopher')
    'KRISTOFA'
    >>> phonet('Niall')
    'NIAL'
    >>> phonet('Smith')
    'SMIT'
    >>> phonet('Schmidt')
    'SHMIT'

    >>> phonet('Christopher', mode=2)
    'KRIZTUFA'
    >>> phonet('Niall', mode=2)
    'NIAL'
    >>> phonet('Smith', mode=2)
    'ZNIT'
    >>> phonet('Schmidt', mode=2)
    'ZNIT'

    >>> phonet('Christopher', lang='none')
    'CHRISTOPHER'
    >>> phonet('Niall', lang='none')
    'NIAL'
    >>> phonet('Smith', lang='none')
    'SMITH'
    >>> phonet('Schmidt', lang='none')
    'SCHMIDT'

    """
    # A fresh encoder is created per call; all arguments are forwarded
    # positionally, in the same order encode() receives them.
    encoder = Phonet()
    return encoder.encode(word, mode, lang)
1775
1776
1777
if __name__ == '__main__':
    # When run as a script, execute the docstring examples in this module.
    from doctest import testmod

    testmod()
1781