Completed
Pull Request — master (#141)
by Chris
13:03
created

abydos.phonetic._phonet.phonet()   A

Complexity

Conditions 1

Size

Total Lines 43
Code Lines 2

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 2
CRAP Score 1

Importance

Changes 0
Metric Value
eloc 2
dl 0
loc 43
ccs 2
cts 2
cp 1
rs 10
c 0
b 0
f 0
cc 1
nop 3
crap 1
1
# -*- coding: utf-8 -*-
0 ignored issues
show
coding-style introduced by
Too many lines in module (1754/1000)
Loading history...
2
3
# Copyright 2014-2018 by Christopher C. Little.
4
# This file is part of Abydos.
5
#
6
# Abydos is free software: you can redistribute it and/or modify
7
# it under the terms of the GNU General Public License as published by
8
# the Free Software Foundation, either version 3 of the License, or
9
# (at your option) any later version.
10
#
11
# Abydos is distributed in the hope that it will be useful,
12
# but WITHOUT ANY WARRANTY; without even the implied warranty of
13
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14
# GNU General Public License for more details.
15
#
16
# You should have received a copy of the GNU General Public License
17
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.
18
19 1
"""abydos.phonetic._phonet.
20
21
The phonetic._phonet module implements the phonet algorithm (a.k.a.
22
Hannoveraner Phonetik), intended chiefly for German.
23
"""
24
25 1
from __future__ import unicode_literals
26
27 1
from collections import Counter
28 1
from unicodedata import normalize as unicode_normalize
29
30 1
from six import text_type
31 1
from six.moves import range
32
33 1
from ._phonetic import Phonetic
34
35 1
# Public API of this module: the names exposed by a star-import.
__all__ = ['Phonet', 'phonet']
36
37
38 1
class Phonet(Phonetic):
0 ignored issues
show
Unused Code introduced by
The variable __class__ seems to be unused.
Loading history...
39
    """Phonet code.
40
41
    phonet ("Hannoveraner Phonetik") was developed by Jörg Michael and
42
    documented in :cite:`Michael:1999`.
43
44
    This is a port of Jesper Zedlitz's code, which is licensed LGPL
45
    :cite:`Zedlitz:2015`.
46
47
    That is, in turn, based on Michael's C code, which is also licensed LGPL
48
    :cite:`Michael:2007`.
49
    """
50
51 1
    _rules_no_lang = (  # separator chars
52
        # fmt: off
53
        '´', ' ', ' ',
54
        '"', ' ', ' ',
55
        '`$', '', '',
56
        '\'', ' ', ' ',
57
        ',', ',', ',',
58
        ';', ',', ',',
59
        '-', ' ', ' ',
60
        ' ', ' ', ' ',
61
        '.', '.', '.',
62
        ':', '.', '.',
63
        # German umlauts
64
        'Ä', 'AE', 'AE',
65
        'Ö', 'OE', 'OE',
66
        'Ü', 'UE', 'UE',
67
        'ß', 'S', 'S',
68
        # international umlauts
69
        'À', 'A', 'A',
70
        'Á', 'A', 'A',
71
        'Â', 'A', 'A',
72
        'Ã', 'A', 'A',
73
        'Å', 'A', 'A',
74
        'Æ', 'AE', 'AE',
75
        'Ç', 'C', 'C',
76
        'Ð', 'DJ', 'DJ',
77
        'È', 'E', 'E',
78
        'É', 'E', 'E',
79
        'Ê', 'E', 'E',
80
        'Ë', 'E', 'E',
81
        'Ì', 'I', 'I',
82
        'Í', 'I', 'I',
83
        'Î', 'I', 'I',
84
        'Ï', 'I', 'I',
85
        'Ñ', 'NH', 'NH',
86
        'Ò', 'O', 'O',
87
        'Ó', 'O', 'O',
88
        'Ô', 'O', 'O',
89
        'Õ', 'O', 'O',
90
        'Œ', 'OE', 'OE',
91
        'Ø', 'OE', 'OE',
92
        'Š', 'SH', 'SH',
93
        'Þ', 'TH', 'TH',
94
        'Ù', 'U', 'U',
95
        'Ú', 'U', 'U',
96
        'Û', 'U', 'U',
97
        'Ý', 'Y', 'Y',
98
        'Ÿ', 'Y', 'Y',
99
        # 'normal' letters (A-Z)
100
        'MC^', 'MAC', 'MAC',
101
        'MC^', 'MAC', 'MAC',
102
        'M´^', 'MAC', 'MAC',
103
        'M\'^', 'MAC', 'MAC',
104
        'O´^', 'O', 'O',
105
        'O\'^', 'O', 'O',
106
        'VAN DEN ^', 'VANDEN', 'VANDEN',
107
        None, None, None
108
        # fmt: on
109
    )
110
111 1
    _rules_german = (  # separator chars
112
        # fmt: off
113
        '´', ' ', ' ',
114
        '"', ' ', ' ',
115
        '`$', '', '',
116
        '\'', ' ', ' ',
117
        ',', ' ', ' ',
118
        ';', ' ', ' ',
119
        '-', ' ', ' ',
120
        ' ', ' ', ' ',
121
        '.', '.', '.',
122
        ':', '.', '.',
123
        # German umlauts
124
        'ÄE', 'E', 'E',
125
        'ÄU<', 'EU', 'EU',
126
        'ÄV(AEOU)-<', 'EW', None,
127
        'Ä$', 'Ä', None,
128
        'Ä<', None, 'E',
129
        'Ä', 'E', None,
130
        'ÖE', 'Ö', 'Ö',
131
        'ÖU', 'Ö', 'Ö',
132
        'ÖVER--<', 'ÖW', None,
133
        'ÖV(AOU)-', 'ÖW', None,
134
        'ÜBEL(GNRW)-^^', 'ÜBL ', 'IBL ',
135
        'ÜBER^^', 'ÜBA', 'IBA',
136
        'ÜE', 'Ü', 'I',
137
        'ÜVER--<', 'ÜW', None,
138
        'ÜV(AOU)-', 'ÜW', None,
139
        'Ü', None, 'I',
140
        'ßCH<', None, 'Z',
141
        'ß<', 'S', 'Z',
142
        # international umlauts
143
        'À<', 'A', 'A',
144
        'Á<', 'A', 'A',
145
        'Â<', 'A', 'A',
146
        'Ã<', 'A', 'A',
147
        'Å<', 'A', 'A',
148
        'ÆER-', 'E', 'E',
149
        'ÆU<', 'EU', 'EU',
150
        'ÆV(AEOU)-<', 'EW', None,
151
        'Æ$', 'Ä', None,
152
        'Æ<', None, 'E',
153
        'Æ', 'E', None,
154
        'Ç', 'Z', 'Z',
155
        'ÐÐ-', '', '',
156
        'Ð', 'DI', 'TI',
157
        'È<', 'E', 'E',
158
        'É<', 'E', 'E',
159
        'Ê<', 'E', 'E',
160
        'Ë', 'E', 'E',
161
        'Ì<', 'I', 'I',
162
        'Í<', 'I', 'I',
163
        'Î<', 'I', 'I',
164
        'Ï', 'I', 'I',
165
        'ÑÑ-', '', '',
166
        'Ñ', 'NI', 'NI',
167
        'Ò<', 'O', 'U',
168
        'Ó<', 'O', 'U',
169
        'Ô<', 'O', 'U',
170
        'Õ<', 'O', 'U',
171
        'Œ<', 'Ö', 'Ö',
172
        'Ø(IJY)-<', 'E', 'E',
173
        'Ø<', 'Ö', 'Ö',
174
        'Š', 'SH', 'Z',
175
        'Þ', 'T', 'T',
176
        'Ù<', 'U', 'U',
177
        'Ú<', 'U', 'U',
178
        'Û<', 'U', 'U',
179
        'Ý<', 'I', 'I',
180
        'Ÿ<', 'I', 'I',
181
        # 'normal' letters (A-Z)
182
        'ABELLE$', 'ABL', 'ABL',
183
        'ABELL$', 'ABL', 'ABL',
184
        'ABIENNE$', 'ABIN', 'ABIN',
185
        'ACHME---^', 'ACH', 'AK',
186
        'ACEY$', 'AZI', 'AZI',
187
        'ADV', 'ATW', None,
188
        'AEGL-', 'EK', None,
189
        'AEU<', 'EU', 'EU',
190
        'AE2', 'E', 'E',
191
        'AFTRAUBEN------', 'AFT ', 'AFT ',
192
        'AGL-1', 'AK', None,
193
        'AGNI-^', 'AKN', 'AKN',
194
        'AGNIE-', 'ANI', 'ANI',
195
        'AGN(AEOU)-$', 'ANI', 'ANI',
196
        'AH(AIOÖUÜY)-', 'AH', None,
197
        'AIA2', 'AIA', 'AIA',
198
        'AIE$', 'E', 'E',
199
        'AILL(EOU)-', 'ALI', 'ALI',
200
        'AINE$', 'EN', 'EN',
201
        'AIRE$', 'ER', 'ER',
202
        'AIR-', 'E', 'E',
203
        'AISE$', 'ES', 'EZ',
204
        'AISSANCE$', 'ESANS', 'EZANZ',
205
        'AISSE$', 'ES', 'EZ',
206
        'AIX$', 'EX', 'EX',
207
        'AJ(AÄEÈÉÊIOÖUÜ)--', 'A', 'A',
208
        'AKTIE', 'AXIE', 'AXIE',
209
        'AKTUEL', 'AKTUEL', None,
210
        'ALOI^', 'ALOI', 'ALUI',  # Don't merge these rules
211
        'ALOY^', 'ALOI', 'ALUI',  # needed by 'check_rules'
212
        'AMATEU(RS)-', 'AMATÖ', 'ANATÖ',
213
        'ANCH(OEI)-', 'ANSH', 'ANZ',
214
        'ANDERGEGANG----', 'ANDA GE', 'ANTA KE',
215
        'ANDERGEHE----', 'ANDA ', 'ANTA ',
216
        'ANDERGESETZ----', 'ANDA GE', 'ANTA KE',
217
        'ANDERGING----', 'ANDA ', 'ANTA ',
218
        'ANDERSETZ(ET)-----', 'ANDA ', 'ANTA ',
219
        'ANDERZUGEHE----', 'ANDA ZU ', 'ANTA ZU ',
220
        'ANDERZUSETZE-----', 'ANDA ZU ', 'ANTA ZU ',
221
        'ANER(BKO)---^^', 'AN', None,
222
        'ANHAND---^$', 'AN H', 'AN ',
223
        'ANH(AÄEIOÖUÜY)--^^', 'AN', None,
224
        'ANIELLE$', 'ANIEL', 'ANIL',
225
        'ANIEL', 'ANIEL', None,
226
        'ANSTELLE----^$', 'AN ST', 'AN ZT',
227
        'ANTI^^', 'ANTI', 'ANTI',
228
        'ANVER^^', 'ANFA', 'ANFA',
229
        'ATIA$', 'ATIA', 'ATIA',
230
        'ATIA(NS)--', 'ATI', 'ATI',
231
        'ATI(AÄOÖUÜ)-', 'AZI', 'AZI',
232
        'AUAU--', '', '',
233
        'AUERE$', 'AUERE', None,
234
        'AUERE(NS)-$', 'AUERE', None,
235
        'AUERE(AIOUY)--', 'AUER', None,
236
        'AUER(AÄIOÖUÜY)-', 'AUER', None,
237
        'AUER<', 'AUA', 'AUA',
238
        'AUF^^', 'AUF', 'AUF',
239
        'AULT$', 'O', 'U',
240
        'AUR(BCDFGKLMNQSTVWZ)-', 'AUA', 'AUA',
241
        'AUR$', 'AUA', 'AUA',
242
        'AUSSE$', 'OS', 'UZ',
243
        'AUS(ST)-^', 'AUS', 'AUS',
244
        'AUS^^', 'AUS', 'AUS',
245
        'AUTOFAHR----', 'AUTO ', 'AUTU ',
246
        'AUTO^^', 'AUTO', 'AUTU',
247
        'AUX(IY)-', 'AUX', 'AUX',
248
        'AUX', 'O', 'U',
249
        'AU', 'AU', 'AU',
250
        'AVER--<', 'AW', None,
251
        'AVIER$', 'AWIE', 'AFIE',
252
        'AV(EÈÉÊI)-^', 'AW', None,
253
        'AV(AOU)-', 'AW', None,
254
        'AYRE$', 'EIRE', 'EIRE',
255
        'AYRE(NS)-$', 'EIRE', 'EIRE',
256
        'AYRE(AIOUY)--', 'EIR', 'EIR',
257
        'AYR(AÄIOÖUÜY)-', 'EIR', 'EIR',
258
        'AYR<', 'EIA', 'EIA',
259
        'AYER--<', 'EI', 'EI',
260
        'AY(AÄEIOÖUÜY)--', 'A', 'A',
261
        'AË', 'E', 'E',
262
        'A(IJY)<', 'EI', 'EI',
263
        'BABY^$', 'BEBI', 'BEBI',
264
        'BAB(IY)^', 'BEBI', 'BEBI',
265
        'BEAU^$', 'BO', None,
266
        'BEA(BCMNRU)-^', 'BEA', 'BEA',
267
        'BEAT(AEIMORU)-^', 'BEAT', 'BEAT',
268
        'BEE$', 'BI', 'BI',
269
        'BEIGE^$', 'BESH', 'BEZ',
270
        'BENOIT--', 'BENO', 'BENU',
271
        'BER(DT)-', 'BER', None,
272
        'BERN(DT)-', 'BERN', None,
273
        'BE(LMNRST)-^', 'BE', 'BE',
274
        'BETTE$', 'BET', 'BET',
275
        'BEVOR^$', 'BEFOR', None,
276
        'BIC$', 'BIZ', 'BIZ',
277
        'BOWL(EI)-', 'BOL', 'BUL',
278
        'BP(AÄEÈÉÊIÌÍÎOÖRUÜY)-', 'B', 'B',
279
        'BRINGEND-----^', 'BRI', 'BRI',
280
        'BRINGEND-----', ' BRI', ' BRI',
281
        'BROW(NS)-', 'BRAU', 'BRAU',
282
        'BUDGET7', 'BÜGE', 'BIKE',
283
        'BUFFET7', 'BÜFE', 'BIFE',
284
        'BYLLE$', 'BILE', 'BILE',
285
        'BYLL$', 'BIL', 'BIL',
286
        'BYPA--^', 'BEI', 'BEI',
287
        'BYTE<', 'BEIT', 'BEIT',
288
        'BY9^', 'BÜ', None,
289
        'B(SßZ)$', 'BS', None,
290
        'CACH(EI)-^', 'KESH', 'KEZ',
291
        'CAE--', 'Z', 'Z',
292
        'CA(IY)$', 'ZEI', 'ZEI',
293
        'CE(EIJUY)--', 'Z', 'Z',
294
        'CENT<', 'ZENT', 'ZENT',
295
        'CERST(EI)----^', 'KE', 'KE',
296
        'CER$', 'ZA', 'ZA',
297
        'CE3', 'ZE', 'ZE',
298
        'CH\'S$', 'X', 'X',
299
        'CH´S$', 'X', 'X',
300
        'CHAO(ST)-', 'KAO', 'KAU',
301
        'CHAMPIO-^', 'SHEMPI', 'ZENBI',
302
        'CHAR(AI)-^', 'KAR', 'KAR',
303
        'CHAU(CDFSVWXZ)-', 'SHO', 'ZU',
304
        'CHÄ(CF)-', 'SHE', 'ZE',
305
        'CHE(CF)-', 'SHE', 'ZE',
306
        'CHEM-^', 'KE', 'KE',  # or: 'CHE', 'KE'
307
        'CHEQUE<', 'SHEK', 'ZEK',
308
        'CHI(CFGPVW)-', 'SHI', 'ZI',
309
        'CH(AEUY)-<^', 'SH', 'Z',
310
        'CHK-', '', '',
311
        'CHO(CKPS)-^', 'SHO', 'ZU',
312
        'CHRIS-', 'KRI', None,
313
        'CHRO-', 'KR', None,
314
        'CH(LOR)-<^', 'K', 'K',
315
        'CHST-', 'X', 'X',
316
        'CH(SßXZ)3', 'X', 'X',
317
        'CHTNI-3', 'CHN', 'KN',
318
        'CH^', 'K', 'K',  # or: 'CH', 'K'
319
        'CH', 'CH', 'K',
320
        'CIC$', 'ZIZ', 'ZIZ',
321
        'CIENCEFICT----', 'EIENS ', 'EIENZ ',
322
        'CIENCE$', 'EIENS', 'EIENZ',
323
        'CIER$', 'ZIE', 'ZIE',
324
        'CYB-^', 'ZEI', 'ZEI',
325
        'CY9^', 'ZÜ', 'ZI',
326
        'C(IJY)-<3', 'Z', 'Z',
327
        'CLOWN-', 'KLAU', 'KLAU',
328
        'CCH', 'Z', 'Z',
329
        'CCE-', 'X', 'X',
330
        'C(CK)-', '', '',
331
        'CLAUDET---', 'KLO', 'KLU',
332
        'CLAUDINE^$', 'KLODIN', 'KLUTIN',
333
        'COACH', 'KOSH', 'KUZ',
334
        'COLE$', 'KOL', 'KUL',
335
        'COUCH', 'KAUSH', 'KAUZ',
336
        'COW', 'KAU', 'KAU',
337
        'CQUES$', 'K', 'K',
338
        'CQUE', 'K', 'K',
339
        'CRASH--9', 'KRE', 'KRE',
340
        'CREAT-^', 'KREA', 'KREA',
341
        'CST', 'XT', 'XT',
342
        'CS<^', 'Z', 'Z',
343
        'C(SßX)', 'X', 'X',
344
        'CT\'S$', 'X', 'X',
345
        'CT(SßXZ)', 'X', 'X',
346
        'CZ<', 'Z', 'Z',
347
        'C(ÈÉÊÌÍÎÝ)3', 'Z', 'Z',
348
        'C.^', 'C.', 'C.',
349
        'CÄ-', 'Z', 'Z',
350
        'CÜ$', 'ZÜ', 'ZI',
351
        'C\'S$', 'X', 'X',
352
        'C<', 'K', 'K',
353
        'DAHER^$', 'DAHER', None,
354
        'DARAUFFOLGE-----', 'DARAUF ', 'TARAUF ',
355
        'DAVO(NR)-^$', 'DAFO', 'TAFU',
356
        'DD(SZ)--<', '', '',
357
        'DD9', 'D', None,
358
        'DEPOT7', 'DEPO', 'TEBU',
359
        'DESIGN', 'DISEIN', 'TIZEIN',
360
        'DE(LMNRST)-3^', 'DE', 'TE',
361
        'DETTE$', 'DET', 'TET',
362
        'DH$', 'T', None,
363
        'DIC$', 'DIZ', 'TIZ',
364
        'DIDR-^', 'DIT', None,
365
        'DIEDR-^', 'DIT', None,
366
        'DJ(AEIOU)-^', 'I', 'I',
367
        'DMITR-^', 'DIMIT', 'TINIT',
368
        'DRY9^', 'DRÜ', None,
369
        'DT-', '', '',
370
        'DUIS-^', 'DÜ', 'TI',
371
        'DURCH^^', 'DURCH', 'TURK',
372
        'DVA$', 'TWA', None,
373
        'DY9^', 'DÜ', None,
374
        'DYS$', 'DIS', None,
375
        'DS(CH)--<', 'T', 'T',
376
        'DST', 'ZT', 'ZT',
377
        'DZS(CH)--', 'T', 'T',
378
        'D(SßZ)', 'Z', 'Z',
379
        'D(AÄEIOÖRUÜY)-', 'D', None,
380
        'D(ÀÁÂÃÅÈÉÊÌÍÎÙÚÛ)-', 'D', None,
381
        'D\'H^', 'D', 'T',
382
        'D´H^', 'D', 'T',
383
        'D`H^', 'D', 'T',
384
        'D\'S3$', 'Z', 'Z',
385
        'D´S3$', 'Z', 'Z',
386
        'D^', 'D', None,
387
        'D', 'T', 'T',
388
        'EAULT$', 'O', 'U',
389
        'EAUX$', 'O', 'U',
390
        'EAU', 'O', 'U',
391
        'EAV', 'IW', 'IF',
392
        'EAS3$', 'EAS', None,
393
        'EA(AÄEIOÖÜY)-3', 'EA', 'EA',
394
        'EA3$', 'EA', 'EA',
395
        'EA3', 'I', 'I',
396
        'EBENSO^$', 'EBNSO', 'EBNZU',
397
        'EBENSO^^', 'EBNSO ', 'EBNZU ',
398
        'EBEN^^', 'EBN', 'EBN',
399
        'EE9', 'E', 'E',
400
        'EGL-1', 'EK', None,
401
        'EHE(IUY)--1', 'EH', None,
402
        'EHUNG---1', 'E', None,
403
        'EH(AÄIOÖUÜY)-1', 'EH', None,
404
        'EIEI--', '', '',
405
        'EIERE^$', 'EIERE', None,
406
        'EIERE$', 'EIERE', None,
407
        'EIERE(NS)-$', 'EIERE', None,
408
        'EIERE(AIOUY)--', 'EIER', None,
409
        'EIER(AÄIOÖUÜY)-', 'EIER', None,
410
        'EIER<', 'EIA', None,
411
        'EIGL-1', 'EIK', None,
412
        'EIGH$', 'EI', 'EI',
413
        'EIH--', 'E', 'E',
414
        'EILLE$', 'EI', 'EI',
415
        'EIR(BCDFGKLMNQSTVWZ)-', 'EIA', 'EIA',
416
        'EIR$', 'EIA', 'EIA',
417
        'EITRAUBEN------', 'EIT ', 'EIT ',
418
        'EI', 'EI', 'EI',
419
        'EJ$', 'EI', 'EI',
420
        'ELIZ^', 'ELIS', None,
421
        'ELZ^', 'ELS', None,
422
        'EL-^', 'E', 'E',
423
        'ELANG----1', 'E', 'E',
424
        'EL(DKL)--1', 'E', 'E',
425
        'EL(MNT)--1$', 'E', 'E',
426
        'ELYNE$', 'ELINE', 'ELINE',
427
        'ELYN$', 'ELIN', 'ELIN',
428
        'EL(AÄEÈÉÊIÌÍÎOÖUÜY)-1', 'EL', 'EL',
429
        'EL-1', 'L', 'L',
430
        'EM-^', None, 'E',
431
        'EM(DFKMPQT)--1', None, 'E',
432
        'EM(AÄEÈÉÊIÌÍÎOÖUÜY)--1', None, 'E',
433
        'EM-1', None, 'N',
434
        'ENGAG-^', 'ANGA', 'ANKA',
435
        'EN-^', 'E', 'E',
436
        'ENTUEL', 'ENTUEL', None,
437
        'EN(CDGKQSTZ)--1', 'E', 'E',
438
        'EN(AÄEÈÉÊIÌÍÎNOÖUÜY)-1', 'EN', 'EN',
439
        'EN-1', '', '',
440
        'ERH(AÄEIOÖUÜ)-^', 'ERH', 'ER',
441
        'ER-^', 'E', 'E',
442
        'ERREGEND-----', ' ER', ' ER',
443
        'ERT1$', 'AT', None,
444
        'ER(DGLKMNRQTZß)-1', 'ER', None,
445
        'ER(AÄEÈÉÊIÌÍÎOÖUÜY)-1', 'ER', 'A',
446
        'ER1$', 'A', 'A',
447
        'ER<1', 'A', 'A',
448
        'ETAT7', 'ETA', 'ETA',
449
        'ETI(AÄOÖÜU)-', 'EZI', 'EZI',
450
        'EUERE$', 'EUERE', None,
451
        'EUERE(NS)-$', 'EUERE', None,
452
        'EUERE(AIOUY)--', 'EUER', None,
453
        'EUER(AÄIOÖUÜY)-', 'EUER', None,
454
        'EUER<', 'EUA', None,
455
        'EUEU--', '', '',
456
        'EUILLE$', 'Ö', 'Ö',
457
        'EUR$', 'ÖR', 'ÖR',
458
        'EUX', 'Ö', 'Ö',
459
        'EUSZ$', 'EUS', None,
460
        'EUTZ$', 'EUS', None,
461
        'EUYS$', 'EUS', 'EUZ',
462
        'EUZ$', 'EUS', None,
463
        'EU', 'EU', 'EU',
464
        'EVER--<1', 'EW', None,
465
        'EV(ÄOÖUÜ)-1', 'EW', None,
466
        'EYER<', 'EIA', 'EIA',
467
        'EY<', 'EI', 'EI',
468
        'FACETTE', 'FASET', 'FAZET',
469
        'FANS--^$', 'FE', 'FE',
470
        'FAN-^$', 'FE', 'FE',
471
        'FAULT-', 'FOL', 'FUL',
472
        'FEE(DL)-', 'FI', 'FI',
473
        'FEHLER', 'FELA', 'FELA',
474
        'FE(LMNRST)-3^', 'FE', 'FE',
475
        'FOERDERN---^', 'FÖRD', 'FÖRT',
476
        'FOERDERN---', ' FÖRD', ' FÖRT',
477
        'FOND7', 'FON', 'FUN',
478
        'FRAIN$', 'FRA', 'FRA',
479
        'FRISEU(RS)-', 'FRISÖ', 'FRIZÖ',
480
        'FY9^', 'FÜ', None,
481
        'FÖRDERN---^', 'FÖRD', 'FÖRT',
482
        'FÖRDERN---', ' FÖRD', ' FÖRT',
483
        'GAGS^$', 'GEX', 'KEX',
484
        'GAG^$', 'GEK', 'KEK',
485
        'GD', 'KT', 'KT',
486
        'GEGEN^^', 'GEGN', 'KEKN',
487
        'GEGENGEKOM-----', 'GEGN ', 'KEKN ',
488
        'GEGENGESET-----', 'GEGN ', 'KEKN ',
489
        'GEGENKOMME-----', 'GEGN ', 'KEKN ',
490
        'GEGENZUKOM---', 'GEGN ZU ', 'KEKN ZU ',
491
        'GENDETWAS-----$', 'GENT ', 'KENT ',
492
        'GENRE', 'IORE', 'IURE',
493
        'GE(LMNRST)-3^', 'GE', 'KE',
494
        'GER(DKT)-', 'GER', None,
495
        'GETTE$', 'GET', 'KET',
496
        'GGF.', 'GF.', None,
497
        'GG-', '', '',
498
        'GH', 'G', None,
499
        'GI(AOU)-^', 'I', 'I',
500
        'GION-3', 'KIO', 'KIU',
501
        'G(CK)-', '', '',
502
        'GJ(AEIOU)-^', 'I', 'I',
503
        'GMBH^$', 'GMBH', 'GMBH',
504
        'GNAC$', 'NIAK', 'NIAK',
505
        'GNON$', 'NION', 'NIUN',
506
        'GN$', 'N', 'N',
507
        'GONCAL-^', 'GONZA', 'KUNZA',
508
        'GRY9^', 'GRÜ', None,
509
        'G(SßXZ)-<', 'K', 'K',
510
        'GUCK-', 'KU', 'KU',
511
        'GUISEP-^', 'IUSE', 'IUZE',
512
        'GUI-^', 'G', 'K',
513
        'GUTAUSSEH------^', 'GUT ', 'KUT ',
514
        'GUTGEHEND------^', 'GUT ', 'KUT ',
515
        'GY9^', 'GÜ', None,
516
        'G(AÄEILOÖRUÜY)-', 'G', None,
517
        'G(ÀÁÂÃÅÈÉÊÌÍÎÙÚÛ)-', 'G', None,
518
        'G\'S$', 'X', 'X',
519
        'G´S$', 'X', 'X',
520
        'G^', 'G', None,
521
        'G', 'K', 'K',
522
        'HA(HIUY)--1', 'H', None,
523
        'HANDVOL---^', 'HANT ', 'ANT ',
524
        'HANNOVE-^', 'HANOF', None,
525
        'HAVEN7$', 'HAFN', None,
526
        'HEAD-', 'HE', 'E',
527
        'HELIEGEN------', 'E ', 'E ',
528
        'HESTEHEN------', 'E ', 'E ',
529
        'HE(LMNRST)-3^', 'HE', 'E',
530
        'HE(LMN)-1', 'E', 'E',
531
        'HEUR1$', 'ÖR', 'ÖR',
532
        'HE(HIUY)--1', 'H', None,
533
        'HIH(AÄEIOÖUÜY)-1', 'IH', None,
534
        'HLH(AÄEIOÖUÜY)-1', 'LH', None,
535
        'HMH(AÄEIOÖUÜY)-1', 'MH', None,
536
        'HNH(AÄEIOÖUÜY)-1', 'NH', None,
537
        'HOBBY9^', 'HOBI', None,
538
        'HOCHBEGAB-----^', 'HOCH ', 'UK ',
539
        'HOCHTALEN-----^', 'HOCH ', 'UK ',
540
        'HOCHZUFRI-----^', 'HOCH ', 'UK ',
541
        'HO(HIY)--1', 'H', None,
542
        'HRH(AÄEIOÖUÜY)-1', 'RH', None,
543
        'HUH(AÄEIOÖUÜY)-1', 'UH', None,
544
        'HUIS^^', 'HÜS', 'IZ',
545
        'HUIS$', 'ÜS', 'IZ',
546
        'HUI--1', 'H', None,
547
        'HYGIEN^', 'HÜKIEN', None,
548
        'HY9^', 'HÜ', None,
549
        'HY(BDGMNPST)-', 'Ü', None,
550
        'H.^', None, 'H.',
551
        'HÄU--1', 'H', None,
552
        'H^', 'H', '',
553
        'H', '', '',
554
        'ICHELL---', 'ISH', 'IZ',
555
        'ICHI$', 'ISHI', 'IZI',
556
        'IEC$', 'IZ', 'IZ',
557
        'IEDENSTELLE------', 'IDN ', 'ITN ',
558
        'IEI-3', '', '',
559
        'IELL3', 'IEL', 'IEL',
560
        'IENNE$', 'IN', 'IN',
561
        'IERRE$', 'IER', 'IER',
562
        'IERZULAN---', 'IR ZU ', 'IR ZU ',
563
        'IETTE$', 'IT', 'IT',
564
        'IEU', 'IÖ', 'IÖ',
565
        'IE<4', 'I', 'I',
566
        'IGL-1', 'IK', None,
567
        'IGHT3$', 'EIT', 'EIT',
568
        'IGNI(EO)-', 'INI', 'INI',
569
        'IGN(AEOU)-$', 'INI', 'INI',
570
        'IHER(DGLKRT)--1', 'IHE', None,
571
        'IHE(IUY)--', 'IH', None,
572
        'IH(AIOÖUÜY)-', 'IH', None,
573
        'IJ(AOU)-', 'I', 'I',
574
        'IJ$', 'I', 'I',
575
        'IJ<', 'EI', 'EI',
576
        'IKOLE$', 'IKOL', 'IKUL',
577
        'ILLAN(STZ)--4', 'ILIA', 'ILIA',
578
        'ILLAR(DT)--4', 'ILIA', 'ILIA',
579
        'IMSTAN----^', 'IM ', 'IN ',
580
        'INDELERREGE------', 'INDL ', 'INTL ',
581
        'INFRAGE-----^$', 'IN ', 'IN ',
582
        'INTERN(AOU)-^', 'INTAN', 'INTAN',
583
        'INVER-', 'INWE', 'INFE',
584
        'ITI(AÄIOÖUÜ)-', 'IZI', 'IZI',
585
        'IUSZ$', 'IUS', None,
586
        'IUTZ$', 'IUS', None,
587
        'IUZ$', 'IUS', None,
588
        'IVER--<', 'IW', None,
589
        'IVIER$', 'IWIE', 'IFIE',
590
        'IV(ÄOÖUÜ)-', 'IW', None,
591
        'IV<3', 'IW', None,
592
        'IY2', 'I', None,
593
        'I(ÈÉÊ)<4', 'I', 'I',
594
        'JAVIE---<^', 'ZA', 'ZA',
595
        'JEANS^$', 'JINS', 'INZ',
596
        'JEANNE^$', 'IAN', 'IAN',
597
        'JEAN-^', 'IA', 'IA',
598
        'JER-^', 'IE', 'IE',
599
        'JE(LMNST)-', 'IE', 'IE',
600
        'JI^', 'JI', None,
601
        'JOR(GK)^$', 'IÖRK', 'IÖRK',
602
        'J', 'I', 'I',
603
        'KC(ÄEIJ)-', 'X', 'X',
604
        'KD', 'KT', None,
605
        'KE(LMNRST)-3^', 'KE', 'KE',
606
        'KG(AÄEILOÖRUÜY)-', 'K', None,
607
        'KH<^', 'K', 'K',
608
        'KIC$', 'KIZ', 'KIZ',
609
        'KLE(LMNRST)-3^', 'KLE', 'KLE',
610
        'KOTELE-^', 'KOTL', 'KUTL',
611
        'KREAT-^', 'KREA', 'KREA',
612
        'KRÜS(TZ)--^', 'KRI', None,
613
        'KRYS(TZ)--^', 'KRI', None,
614
        'KRY9^', 'KRÜ', None,
615
        'KSCH---', 'K', 'K',
616
        'KSH--', 'K', 'K',
617
        'K(SßXZ)7', 'X', 'X',  # implies 'KST' -> 'XT'
618
        'KT\'S$', 'X', 'X',
619
        'KTI(AIOU)-3', 'XI', 'XI',
620
        'KT(SßXZ)', 'X', 'X',
621
        'KY9^', 'KÜ', None,
622
        'K\'S$', 'X', 'X',
623
        'K´S$', 'X', 'X',
624
        'LANGES$', ' LANGES', ' LANKEZ',
625
        'LANGE$', ' LANGE', ' LANKE',
626
        'LANG$', ' LANK', ' LANK',
627
        'LARVE-', 'LARF', 'LARF',
628
        'LD(SßZ)$', 'LS', 'LZ',
629
        'LD\'S$', 'LS', 'LZ',
630
        'LD´S$', 'LS', 'LZ',
631
        'LEAND-^', 'LEAN', 'LEAN',
632
        'LEERSTEHE-----^', 'LER ', 'LER ',
633
        'LEICHBLEIB-----', 'LEICH ', 'LEIK ',
634
        'LEICHLAUTE-----', 'LEICH ', 'LEIK ',
635
        'LEIDERREGE------', 'LEIT ', 'LEIT ',
636
        'LEIDGEPR----^', 'LEIT ', 'LEIT ',
637
        'LEINSTEHE-----', 'LEIN ', 'LEIN ',
638
        'LEL-', 'LE', 'LE',
639
        'LE(MNRST)-3^', 'LE', 'LE',
640
        'LETTE$', 'LET', 'LET',
641
        'LFGNAG-', 'LFGAN', 'LFKAN',
642
        'LICHERWEIS----', 'LICHA ', 'LIKA ',
643
        'LIC$', 'LIZ', 'LIZ',
644
        'LIVE^$', 'LEIF', 'LEIF',
645
        'LT(SßZ)$', 'LS', 'LZ',
646
        'LT\'S$', 'LS', 'LZ',
647
        'LT´S$', 'LS', 'LZ',
648
        'LUI(GS)--', 'LU', 'LU',
649
        'LV(AIO)-', 'LW', None,
650
        'LY9^', 'LÜ', None,
651
        'LSTS$', 'LS', 'LZ',
652
        'LZ(BDFGKLMNPQRSTVWX)-', 'LS', None,
653
        'L(SßZ)$', 'LS', None,
654
        'MAIR-<', 'MEI', 'NEI',
655
        'MANAG-', 'MENE', 'NENE',
656
        'MANUEL', 'MANUEL', None,
657
        'MASSEU(RS)-', 'MASÖ', 'NAZÖ',
658
        'MATCH', 'MESH', 'NEZ',
659
        'MAURICE', 'MORIS', 'NURIZ',
660
        'MBH^$', 'MBH', 'MBH',
661
        'MB(ßZ)$', 'MS', None,
662
        'MB(SßTZ)-', 'M', 'N',
663
        'MCG9^', 'MAK', 'NAK',
664
        'MC9^', 'MAK', 'NAK',
665
        'MEMOIR-^', 'MEMOA', 'NENUA',
666
        'MERHAVEN$', 'MAHAFN', None,
667
        'ME(LMNRST)-3^', 'ME', 'NE',
668
        'MEN(STZ)--3', 'ME', None,
669
        'MEN$', 'MEN', None,
670
        'MIGUEL-', 'MIGE', 'NIKE',
671
        'MIKE^$', 'MEIK', 'NEIK',
672
        'MITHILFE----^$', 'MIT H', 'NIT ',
673
        'MN$', 'M', None,
674
        'MN', 'N', 'N',
675
        'MPJUTE-', 'MPUT', 'NBUT',
676
        'MP(ßZ)$', 'MS', None,
677
        'MP(SßTZ)-', 'M', 'N',
678
        'MP(BDJLMNPQVW)-', 'MB', 'NB',
679
        'MY9^', 'MÜ', None,
680
        'M(ßZ)$', 'MS', None,
681
        'M´G7^', 'MAK', 'NAK',
682
        'M\'G7^', 'MAK', 'NAK',
683
        'M´^', 'MAK', 'NAK',
684
        'M\'^', 'MAK', 'NAK',
685
        'M', None, 'N',
686
        'NACH^^', 'NACH', 'NAK',
687
        'NADINE', 'NADIN', 'NATIN',
688
        'NAIV--', 'NA', 'NA',
689
        'NAISE$', 'NESE', 'NEZE',
690
        'NAUGENOMM------', 'NAU ', 'NAU ',
691
        'NAUSOGUT$', 'NAUSO GUT', 'NAUZU KUT',
692
        'NCH$', 'NSH', 'NZ',
693
        'NCOISE$', 'SOA', 'ZUA',
694
        'NCOIS$', 'SOA', 'ZUA',
695
        'NDAR$', 'NDA', 'NTA',
696
        'NDERINGEN------', 'NDE ', 'NTE ',
697
        'NDRO(CDKTZ)-', 'NTRO', None,
698
        'ND(BFGJLMNPQVW)-', 'NT', None,
699
        'ND(SßZ)$', 'NS', 'NZ',
700
        'ND\'S$', 'NS', 'NZ',
701
        'ND´S$', 'NS', 'NZ',
702
        'NEBEN^^', 'NEBN', 'NEBN',
703
        'NENGELERN------', 'NEN ', 'NEN ',
704
        'NENLERN(ET)---', 'NEN LE', 'NEN LE',
705
        'NENZULERNE---', 'NEN ZU LE', 'NEN ZU LE',
706
        'NE(LMNRST)-3^', 'NE', 'NE',
707
        'NEN-3', 'NE', 'NE',
708
        'NETTE$', 'NET', 'NET',
709
        'NGU^^', 'NU', 'NU',
710
        'NG(BDFJLMNPQRTVW)-', 'NK', 'NK',
711
        'NH(AUO)-$', 'NI', 'NI',
712
        'NICHTSAHNEN-----', 'NIX ', 'NIX ',
713
        'NICHTSSAGE----', 'NIX ', 'NIX ',
714
        'NICHTS^^', 'NIX', 'NIX',
715
        'NICHT^^', 'NICHT', 'NIKT',
716
        'NINE$', 'NIN', 'NIN',
717
        'NON^^', 'NON', 'NUN',
718
        'NOTLEIDE-----^', 'NOT ', 'NUT ',
719
        'NOT^^', 'NOT', 'NUT',
720
        'NTI(AIOU)-3', 'NZI', 'NZI',
721
        'NTIEL--3', 'NZI', 'NZI',
722
        'NT(SßZ)$', 'NS', 'NZ',
723
        'NT\'S$', 'NS', 'NZ',
724
        'NT´S$', 'NS', 'NZ',
725
        'NYLON', 'NEILON', 'NEILUN',
726
        'NY9^', 'NÜ', None,
727
        'NSTZUNEH---', 'NST ZU ', 'NZT ZU ',
728
        'NSZ-', 'NS', None,
729
        'NSTS$', 'NS', 'NZ',
730
        'NZ(BDFGKLMNPQRSTVWX)-', 'NS', None,
731
        'N(SßZ)$', 'NS', None,
732
        'OBERE-', 'OBER', None,
733
        'OBER^^', 'OBA', 'UBA',
734
        'OEU2', 'Ö', 'Ö',
735
        'OE<2', 'Ö', 'Ö',
736
        'OGL-', 'OK', None,
737
        'OGNIE-', 'ONI', 'UNI',
738
        'OGN(AEOU)-$', 'ONI', 'UNI',
739
        'OH(AIOÖUÜY)-', 'OH', None,
740
        'OIE$', 'Ö', 'Ö',
741
        'OIRE$', 'OA', 'UA',
742
        'OIR$', 'OA', 'UA',
743
        'OIX', 'OA', 'UA',
744
        'OI<3', 'EU', 'EU',
745
        'OKAY^$', 'OKE', 'UKE',
746
        'OLYN$', 'OLIN', 'ULIN',
747
        'OO(DLMZ)-', 'U', None,
748
        'OO$', 'U', None,
749
        'OO-', '', '',
750
        'ORGINAL-----', 'ORI', 'URI',
751
        'OTI(AÄOÖUÜ)-', 'OZI', 'UZI',
752
        'OUI^', 'WI', 'FI',
753
        'OUILLE$', 'ULIE', 'ULIE',
754
        'OU(DT)-^', 'AU', 'AU',
755
        'OUSE$', 'AUS', 'AUZ',
756
        'OUT-', 'AU', 'AU',
757
        'OU', 'U', 'U',
758
        'O(FV)$', 'AU', 'AU',  # due to 'OW$' -> 'AU'
759
        'OVER--<', 'OW', None,
760
        'OV(AOU)-', 'OW', None,
761
        'OW$', 'AU', 'AU',
762
        'OWS$', 'OS', 'UZ',
763
        'OJ(AÄEIOÖUÜ)--', 'O', 'U',
764
        'OYER', 'OIA', None,
765
        'OY(AÄEIOÖUÜ)--', 'O', 'U',
766
        'O(JY)<', 'EU', 'EU',
767
        'OZ$', 'OS', None,
768
        'O´^', 'O', 'U',
769
        'O\'^', 'O', 'U',
770
        'O', None, 'U',
771
        'PATIEN--^', 'PAZI', 'PAZI',
772
        'PENSIO-^', 'PANSI', 'PANZI',
773
        'PE(LMNRST)-3^', 'PE', 'PE',
774
        'PFER-^', 'FE', 'FE',
775
        'P(FH)<', 'F', 'F',
776
        'PIC^$', 'PIK', 'PIK',
777
        'PIC$', 'PIZ', 'PIZ',
778
        'PIPELINE', 'PEIBLEIN', 'PEIBLEIN',
779
        'POLYP-', 'POLÜ', None,
780
        'POLY^^', 'POLI', 'PULI',
781
        'PORTRAIT7', 'PORTRE', 'PURTRE',
782
        'POWER7', 'PAUA', 'PAUA',
783
        'PP(FH)--<', 'B', 'B',
784
        'PP-', '', '',
785
        'PRODUZ-^', 'PRODU', 'BRUTU',
786
        'PRODUZI--', ' PRODU', ' BRUTU',
787
        'PRIX^$', 'PRI', 'PRI',
788
        'PS-^^', 'P', None,
789
        'P(SßZ)^', None, 'Z',
790
        'P(SßZ)$', 'BS', None,
791
        'PT-^', '', '',
792
        'PTI(AÄOÖUÜ)-3', 'BZI', 'BZI',
793
        'PY9^', 'PÜ', None,
794
        'P(AÄEIOÖRUÜY)-', 'P', 'P',
795
        'P(ÀÁÂÃÅÈÉÊÌÍÎÙÚÛ)-', 'P', None,
796
        'P.^', None, 'P.',
797
        'P^', 'P', None,
798
        'P', 'B', 'B',
799
        'QI-', 'Z', 'Z',
800
        'QUARANT--', 'KARA', 'KARA',
801
        'QUE(LMNRST)-3', 'KWE', 'KFE',
802
        'QUE$', 'K', 'K',
803
        'QUI(NS)$', 'KI', 'KI',
804
        'QUIZ7', 'KWIS', None,
805
        'Q(UV)7', 'KW', 'KF',
806
        'Q<', 'K', 'K',
807
        'RADFAHR----', 'RAT ', 'RAT ',
808
        'RAEFTEZEHRE-----', 'REFTE ', 'REFTE ',
809
        'RCH', 'RCH', 'RK',
810
        'REA(DU)---3^', 'R', None,
811
        'REBSERZEUG------', 'REBS ', 'REBZ ',
812
        'RECHERCH^', 'RESHASH', 'REZAZ',
813
        'RECYCL--', 'RIZEI', 'RIZEI',
814
        'RE(ALST)-3^', 'RE', None,
815
        'REE$', 'RI', 'RI',
816
        'RER$', 'RA', 'RA',
817
        'RE(MNR)-4', 'RE', 'RE',
818
        'RETTE$', 'RET', 'RET',
819
        'REUZ$', 'REUZ', None,
820
        'REW$', 'RU', 'RU',
821
        'RH<^', 'R', 'R',
822
        'RJA(MN)--', 'RI', 'RI',
823
        'ROWD-^', 'RAU', 'RAU',
824
        'RTEMONNAIE-', 'RTMON', 'RTNUN',
825
        'RTI(AÄOÖUÜ)-3', 'RZI', 'RZI',
826
        'RTIEL--3', 'RZI', 'RZI',
827
        'RV(AEOU)-3', 'RW', None,
828
        'RY(KN)-$', 'RI', 'RI',
829
        'RY9^', 'RÜ', None,
830
        'RÄFTEZEHRE-----', 'REFTE ', 'REFTE ',
831
        'SAISO-^', 'SES', 'ZEZ',
832
        'SAFE^$', 'SEIF', 'ZEIF',
833
        'SAUCE-^', 'SOS', 'ZUZ',
834
        'SCHLAGGEBEN-----<', 'SHLAK ', 'ZLAK ',
835
        'SCHSCH---7', '', '',
836
        'SCHTSCH', 'SH', 'Z',
837
        'SC(HZ)<', 'SH', 'Z',
838
        'SC', 'SK', 'ZK',
839
        'SELBSTST--7^^', 'SELB', 'ZELB',
840
        'SELBST7^^', 'SELBST', 'ZELBZT',
841
        'SERVICE7^', 'SÖRWIS', 'ZÖRFIZ',
842
        'SERVI-^', 'SERW', None,
843
        'SE(LMNRST)-3^', 'SE', 'ZE',
844
        'SETTE$', 'SET', 'ZET',
845
        'SHP-^', 'S', 'Z',
846
        'SHST', 'SHT', 'ZT',
847
        'SHTSH', 'SH', 'Z',
848
        'SHT', 'ST', 'Z',
849
        'SHY9^', 'SHÜ', None,
850
        'SH^^', 'SH', None,
851
        'SH3', 'SH', 'Z',
852
        'SICHERGEGAN-----^', 'SICHA ', 'ZIKA ',
853
        'SICHERGEHE----^', 'SICHA ', 'ZIKA ',
854
        'SICHERGESTEL------^', 'SICHA ', 'ZIKA ',
855
        'SICHERSTELL-----^', 'SICHA ', 'ZIKA ',
856
        'SICHERZU(GS)--^', 'SICHA ZU ', 'ZIKA ZU ',
857
        'SIEGLI-^', 'SIKL', 'ZIKL',
858
        'SIGLI-^', 'SIKL', 'ZIKL',
859
        'SIGHT', 'SEIT', 'ZEIT',
860
        'SIGN', 'SEIN', 'ZEIN',
861
        'SKI(NPZ)-', 'SKI', 'ZKI',
862
        'SKI<^', 'SHI', 'ZI',
863
        'SODASS^$', 'SO DAS', 'ZU TAZ',
864
        'SODAß^$', 'SO DAS', 'ZU TAZ',
865
        'SOGENAN--^', 'SO GEN', 'ZU KEN',
866
        'SOUND-', 'SAUN', 'ZAUN',
867
        'STAATS^^', 'STAZ', 'ZTAZ',
868
        'STADT^^', 'STAT', 'ZTAT',
869
        'STANDE$', ' STANDE', ' ZTANTE',
870
        'START^^', 'START', 'ZTART',
871
        'STAURANT7', 'STORAN', 'ZTURAN',
872
        'STEAK-', 'STE', 'ZTE',
873
        'STEPHEN-^$', 'STEW', None,
874
        'STERN', 'STERN', None,
875
        'STRAF^^', 'STRAF', 'ZTRAF',
876
        'ST\'S$', 'Z', 'Z',
877
        'ST´S$', 'Z', 'Z',
878
        'STST--', '', '',
879
        'STS(ACEÈÉÊHIÌÍÎOUÄÜÖ)--', 'ST', 'ZT',
880
        'ST(SZ)', 'Z', 'Z',
881
        'SPAREN---^', 'SPA', 'ZPA',
882
        'SPAREND----', ' SPA', ' ZPA',
883
        'S(PTW)-^^', 'S', None,
884
        'SP', 'SP', None,
885
        'STYN(AE)-$', 'STIN', 'ZTIN',
886
        'ST', 'ST', 'ZT',
887
        'SUITE<', 'SIUT', 'ZIUT',
888
        'SUKE--$', 'S', 'Z',
889
        'SURF(EI)-', 'SÖRF', 'ZÖRF',
890
        'SV(AEÈÉÊIÌÍÎOU)-<^', 'SW', None,
891
        'SYB(IY)--^', 'SIB', None,
892
        'SYL(KVW)--^', 'SI', None,
893
        'SY9^', 'SÜ', None,
894
        'SZE(NPT)-^', 'ZE', 'ZE',
895
        'SZI(ELN)-^', 'ZI', 'ZI',
896
        'SZCZ<', 'SH', 'Z',
897
        'SZT<', 'ST', 'ZT',
898
        'SZ<3', 'SH', 'Z',
899
        'SÜL(KVW)--^', 'SI', None,
900
        'S', None, 'Z',
901
        'TCH', 'SH', 'Z',
902
        'TD(AÄEIOÖRUÜY)-', 'T', None,
903
        'TD(ÀÁÂÃÅÈÉÊËÌÍÎÏÒÓÔÕØÙÚÛÝŸ)-', 'T', None,
904
        'TEAT-^', 'TEA', 'TEA',
905
        'TERRAI7^', 'TERA', 'TERA',
906
        'TE(LMNRST)-3^', 'TE', 'TE',
907
        'TH<', 'T', 'T',
908
        'TICHT-', 'TIK', 'TIK',
909
        'TICH$', 'TIK', 'TIK',
910
        'TIC$', 'TIZ', 'TIZ',
911
        'TIGGESTELL-------', 'TIK ', 'TIK ',
912
        'TIGSTELL-----', 'TIK ', 'TIK ',
913
        'TOAS-^', 'TO', 'TU',
914
        'TOILET-', 'TOLE', 'TULE',
915
        'TOIN-', 'TOA', 'TUA',
916
        'TRAECHTI-^', 'TRECHT', 'TREKT',
917
        'TRAECHTIG--', ' TRECHT', ' TREKT',
918
        'TRAINI-', 'TREN', 'TREN',
919
        'TRÄCHTI-^', 'TRECHT', 'TREKT',
920
        'TRÄCHTIG--', ' TRECHT', ' TREKT',
921
        'TSCH', 'SH', 'Z',
922
        'TSH', 'SH', 'Z',
923
        'TST', 'ZT', 'ZT',
924
        'T(Sß)', 'Z', 'Z',
925
        'TT(SZ)--<', '', '',
926
        'TT9', 'T', 'T',
927
        'TV^$', 'TV', 'TV',
928
        'TX(AEIOU)-3', 'SH', 'Z',
929
        'TY9^', 'TÜ', None,
930
        'TZ-', '', '',
931
        'T\'S3$', 'Z', 'Z',
932
        'T´S3$', 'Z', 'Z',
933
        'UEBEL(GNRW)-^^', 'ÜBL ', 'IBL ',
934
        'UEBER^^', 'ÜBA', 'IBA',
935
        'UE2', 'Ü', 'I',
936
        'UGL-', 'UK', None,
937
        'UH(AOÖUÜY)-', 'UH', None,
938
        'UIE$', 'Ü', 'I',
939
        'UM^^', 'UM', 'UN',
940
        'UNTERE--3', 'UNTE', 'UNTE',
941
        'UNTER^^', 'UNTA', 'UNTA',
942
        'UNVER^^', 'UNFA', 'UNFA',
943
        'UN^^', 'UN', 'UN',
944
        'UTI(AÄOÖUÜ)-', 'UZI', 'UZI',
945
        'UVE-4', 'UW', None,
946
        'UY2', 'UI', None,
947
        'UZZ', 'AS', 'AZ',
948
        'VACL-^', 'WAZ', 'FAZ',
949
        'VAC$', 'WAZ', 'FAZ',
950
        'VAN DEN ^', 'FANDN', 'FANTN',
951
        'VANES-^', 'WANE', None,
952
        'VATRO-', 'WATR', None,
953
        'VA(DHJNT)--^', 'F', None,
954
        'VEDD-^', 'FE', 'FE',
955
        'VE(BEHIU)--^', 'F', None,
956
        'VEL(BDLMNT)-^', 'FEL', None,
957
        'VENTZ-^', 'FEN', None,
958
        'VEN(NRSZ)-^', 'FEN', None,
959
        'VER(AB)-^$', 'WER', None,
960
        'VERBAL^$', 'WERBAL', None,
961
        'VERBAL(EINS)-^', 'WERBAL', None,
962
        'VERTEBR--', 'WERTE', None,
963
        'VEREIN-----', 'F', None,
964
        'VEREN(AEIOU)-^', 'WEREN', None,
965
        'VERIFI', 'WERIFI', None,
966
        'VERON(AEIOU)-^', 'WERON', None,
967
        'VERSEN^', 'FERSN', 'FAZN',
968
        'VERSIERT--^', 'WERSI', None,
969
        'VERSIO--^', 'WERS', None,
970
        'VERSUS', 'WERSUS', None,
971
        'VERTI(GK)-', 'WERTI', None,
972
        'VER^^', 'FER', 'FA',
973
        'VERSPRECHE-------', ' FER', ' FA',
974
        'VER$', 'WA', None,
975
        'VER', 'FA', 'FA',
976
        'VET(HT)-^', 'FET', 'FET',
977
        'VETTE$', 'WET', 'FET',
978
        'VE^', 'WE', None,
979
        'VIC$', 'WIZ', 'FIZ',
980
        'VIELSAGE----', 'FIL ', 'FIL ',
981
        'VIEL', 'FIL', 'FIL',
982
        'VIEW', 'WIU', 'FIU',
983
        'VILL(AE)-', 'WIL', None,
984
        'VIS(ACEIKUVWZ)-<^', 'WIS', None,
985
        'VI(ELS)--^', 'F', None,
986
        'VILLON--', 'WILI', 'FILI',
987
        'VIZE^^', 'FIZE', 'FIZE',
988
        'VLIE--^', 'FL', None,
989
        'VL(AEIOU)--', 'W', None,
990
        'VOKA-^', 'WOK', None,
991
        'VOL(ATUVW)--^', 'WO', None,
992
        'VOR^^', 'FOR', 'FUR',
993
        'VR(AEIOU)--', 'W', None,
994
        'VV9', 'W', None,
995
        'VY9^', 'WÜ', 'FI',
996
        'V(ÜY)-', 'W', None,
997
        'V(ÀÁÂÃÅÈÉÊÌÍÎÙÚÛ)-', 'W', None,
998
        'V(AEIJLRU)-<', 'W', None,
999
        'V.^', 'V.', None,
1000
        'V<', 'F', 'F',
1001
        'WEITERENTWI-----^', 'WEITA ', 'FEITA ',
1002
        'WEITREICH-----^', 'WEIT ', 'FEIT ',
1003
        'WEITVER^', 'WEIT FER', 'FEIT FA',
1004
        'WE(LMNRST)-3^', 'WE', 'FE',
1005
        'WER(DST)-', 'WER', None,
1006
        'WIC$', 'WIZ', 'FIZ',
1007
        'WIEDERU--', 'WIDE', 'FITE',
1008
        'WIEDER^$', 'WIDA', 'FITA',
1009
        'WIEDER^^', 'WIDA ', 'FITA ',
1010
        'WIEVIEL', 'WI FIL', 'FI FIL',
1011
        'WISUEL', 'WISUEL', None,
1012
        'WR-^', 'W', None,
1013
        'WY9^', 'WÜ', 'FI',
1014
        'W(BDFGJKLMNPQRSTZ)-', 'F', None,
1015
        'W$', 'F', None,
1016
        'W', None, 'F',
1017
        'X<^', 'Z', 'Z',
1018
        'XHAVEN$', 'XAFN', None,
1019
        'X(CSZ)', 'X', 'X',
1020
        'XTS(CH)--', 'XT', 'XT',
1021
        'XT(SZ)', 'Z', 'Z',
1022
        'YE(LMNRST)-3^', 'IE', 'IE',
1023
        'YE-3', 'I', 'I',
1024
        'YOR(GK)^$', 'IÖRK', 'IÖRK',
1025
        'Y(AOU)-<7', 'I', 'I',
1026
        'Y(BKLMNPRSTX)-1', 'Ü', None,
1027
        'YVES^$', 'IF', 'IF',
1028
        'YVONNE^$', 'IWON', 'IFUN',
1029
        'Y.^', 'Y.', None,
1030
        'Y', 'I', 'I',
1031
        'ZC(AOU)-', 'SK', 'ZK',
1032
        'ZE(LMNRST)-3^', 'ZE', 'ZE',
1033
        'ZIEJ$', 'ZI', 'ZI',
1034
        'ZIGERJA(HR)-3', 'ZIGA IA', 'ZIKA IA',
1035
        'ZL(AEIOU)-', 'SL', None,
1036
        'ZS(CHT)--', '', '',
1037
        'ZS', 'SH', 'Z',
1038
        'ZUERST', 'ZUERST', 'ZUERST',
1039
        'ZUGRUNDE^$', 'ZU GRUNDE', 'ZU KRUNTE',
1040
        'ZUGRUNDE', 'ZU GRUNDE ', 'ZU KRUNTE ',
1041
        'ZUGUNSTEN', 'ZU GUNSTN', 'ZU KUNZTN',
1042
        'ZUHAUSE-', 'ZU HAUS', 'ZU AUZ',
1043
        'ZULASTEN^$', 'ZU LASTN', 'ZU LAZTN',
1044
        'ZURUECK^^', 'ZURÜK', 'ZURIK',
1045
        'ZURZEIT', 'ZUR ZEIT', 'ZUR ZEIT',
1046
        'ZURÜCK^^', 'ZURÜK', 'ZURIK',
1047
        'ZUSTANDE', 'ZU STANDE', 'ZU ZTANTE',
1048
        'ZUTAGE', 'ZU TAGE', 'ZU TAKE',
1049
        'ZUVER^^', 'ZUFA', 'ZUFA',
1050
        'ZUVIEL', 'ZU FIL', 'ZU FIL',
1051
        'ZUWENIG', 'ZU WENIK', 'ZU FENIK',
1052
        'ZY9^', 'ZÜ', None,
1053
        'ZYK3$', 'ZIK', None,
1054
        'Z(VW)7^', 'SW', None,
1055
        None, None, None
1056
        # fmt: on
1057
    )
1058
1059 1
    # Upper-casing table handed to str.translate(): the code point of each
    # lower-case letter (ASCII plus the accented Latin letters listed here)
    # maps to its capital form. 'ß' is deliberately mapped to itself.
    _upper_trans = {
        ord(lower): upper
        for lower, upper in zip(
            'abcdefghijklmnopqrstuvwxyzàáâãåäæ'
            'çðèéêëìíîïñòóôõöøœšßþùúûüýÿ',
            'ABCDEFGHIJKLMNOPQRSTUVWXYZÀÁÂÃÅÄÆ'
            'ÇÐÈÉÊËÌÍÎÏÑÒÓÔÕÖØŒŠßÞÙÚÛÜÝŸ',
        )
    }
1070
1071 1
    def encode(self, word, mode=1, lang='de'):
0 ignored issues
show
Bug introduced by
Parameters differ from overridden 'encode' method
Loading history...
1072
        """Return the phonet code for a word.
1073
1074
        Args:
1075
            word (str): The word to transform
1076
            mode (int): The phonet variant to employ (1 or 2)
1077
            lang (str): 'de' (default) for German, 'none' for no language
1078
1079
        Returns:
1080
            str: The phonet value
1081
1082
        Examples:
1083
            >>> pe = Phonet()
1084
            >>> pe.encode('Christopher')
1085
            'KRISTOFA'
1086
            >>> pe.encode('Niall')
1087
            'NIAL'
1088
            >>> pe.encode('Smith')
1089
            'SMIT'
1090
            >>> pe.encode('Schmidt')
1091
            'SHMIT'
1092
1093
            >>> pe.encode('Christopher', mode=2)
1094
            'KRIZTUFA'
1095
            >>> pe.encode('Niall', mode=2)
1096
            'NIAL'
1097
            >>> pe.encode('Smith', mode=2)
1098
            'ZNIT'
1099
            >>> pe.encode('Schmidt', mode=2)
1100
            'ZNIT'
1101
1102
            >>> pe.encode('Christopher', lang='none')
1103
            'CHRISTOPHER'
1104
            >>> pe.encode('Niall', lang='none')
1105
            'NIAL'
1106
            >>> pe.encode('Smith', lang='none')
1107
            'SMITH'
1108
            >>> pe.encode('Schmidt', lang='none')
1109
            'SCHMIDT'
1110
1111
        """
1112 1
        phonet_hash = Counter()
1113 1
        alpha_pos = Counter()
1114
1115 1
        phonet_hash_1 = Counter()
1116 1
        phonet_hash_2 = Counter()
1117
1118 1
        def _initialize_phonet(lang):
            """Fill the phonet lookup tables for the selected rule set.

            Populates the enclosing ``alpha_pos``, ``phonet_hash``,
            ``phonet_hash_1`` and ``phonet_hash_2`` counters in place; nothing
            is returned.

            Args:
                lang (str): 'none' selects the language-neutral rules, any
                    other value selects the German rules

            """
            rules = (
                self._rules_no_lang if lang == 'none' else self._rules_german
            )

            phonet_hash[''] = -1

            # Accented and other non-ASCII capitals all share position 1.
            for umlaut in 'ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖØÙÚÛÜÝÞßŒŠŸ':
                alpha_pos[umlaut] = 1
                phonet_hash[umlaut] = -1

            # The plain letters 'A'-'Z' occupy positions 2-27.
            for position, letter in enumerate('ABCDEFGHIJKLMNOPQRSTUVWXYZ', 2):
                alpha_pos[letter] = position
                phonet_hash[letter] = -1

            # Mark every (first letter, following position) slot as unassigned.
            for row in range(26):
                for col in range(28):
                    phonet_hash_1[row, col] = -1
                    phonet_hash_2[row, col] = -1

            # The rule table is a flat sequence of triples (pattern, mode-1
            # replacement, mode-2 replacement), so the patterns sit at every
            # third index.
            for idx in range(0, len(rules), 3):
                pattern = rules[idx]
                if not pattern:
                    continue

                initial = pattern[0]

                # First-level index: remember the first rule for this
                # character that carries a (truthy) replacement in either mode.
                if phonet_hash[initial] < 0 and (
                    rules[idx + 1] or rules[idx + 2]
                ):
                    phonet_hash[initial] = idx

                # The second-level index exists only for 'A'-'Z'.
                if alpha_pos[initial] < 2:
                    continue

                row = alpha_pos[initial] - 2
                tail = pattern[1:]

                # Work out which characters may follow the initial letter:
                # nothing (' '), a parenthesised set, or one literal character.
                if not tail:
                    followers = ' '
                elif tail[0] == '(':
                    followers = tail[1:]
                else:
                    followers = tail[0]

                for follower in followers:
                    if follower == ')':
                        break

                    col = alpha_pos[follower]

                    if col > 0:
                        # Record the first and last rule index for this pair.
                        if phonet_hash_1[row, col] < 0:
                            phonet_hash_1[row, col] = idx
                            phonet_hash_2[row, col] = idx

                        # Extend the pair's range only while the previous end
                        # is within 30 table entries; otherwise file the rule
                        # under the catch-all column instead.
                        if phonet_hash_2[row, col] >= (idx - 30):
                            phonet_hash_2[row, col] = idx
                        else:
                            col = -1

                    if col <= 0:
                        # Column 0 collects rules valid for any follower.
                        if phonet_hash_1[row, 0] < 0:
                            phonet_hash_1[row, 0] = idx

                        phonet_hash_2[row, 0] = idx
1231
1232 1
        def _phonet(term, mode, lang):
0 ignored issues
show
Comprehensibility introduced by
This function exceeds the maximum number of variables (29/15).
Loading history...
1233
            """Return the phonet coded form of a term.
1234
1235
            Args:
1236
                term (str): Term to transform
1237
                mode (int): The phonet variant to employ (1 or 2)
1238
                lang (str): 'de' (default) for German, 'none' for no language
1239
1240
            Returns:
1241
                str: The phonet value
1242
1243
            """
1244 1
            if lang == 'none':
1245 1
                _phonet_rules = self._rules_no_lang
1246
            else:
1247 1
                _phonet_rules = self._rules_german
1248
1249 1
            char0 = ''
1250 1
            dest = term
1251
1252 1
            if not term:
1253 1
                return ''
1254
1255 1
            term_length = len(term)
1256
1257
            # convert input string to upper-case
1258 1
            src = term.translate(self._upper_trans)
1259
1260
            # check "src"
1261 1
            i = 0
1262 1
            j = 0
1263 1
            zeta = 0
1264
1265 1
            while i < len(src):
0 ignored issues
show
unused-code introduced by
Too many nested blocks (6/5)
Loading history...
unused-code introduced by
Too many nested blocks (8/5)
Loading history...
unused-code introduced by
Too many nested blocks (9/5)
Loading history...
unused-code introduced by
Too many nested blocks (7/5)
Loading history...
1266 1
                char = src[i]
1267
1268 1
                pos = alpha_pos[char]
1269
1270 1
                if pos >= 2:
1271 1
                    xpos = pos - 2
1272
1273 1
                    if i + 1 == len(src):
1274 1
                        pos = alpha_pos['']
1275
                    else:
1276 1
                        pos = alpha_pos[src[i + 1]]
1277
1278 1
                    start1 = phonet_hash_1[xpos, pos]
1279 1
                    start2 = phonet_hash_1[xpos, 0]
1280 1
                    end1 = phonet_hash_2[xpos, pos]
1281 1
                    end2 = phonet_hash_2[xpos, 0]
1282
1283
                    # preserve rule priorities
1284 1
                    if (start2 >= 0) and ((start1 < 0) or (start2 < start1)):
1285 1
                        pos = start1
1286 1
                        start1 = start2
1287 1
                        start2 = pos
1288 1
                        pos = end1
1289 1
                        end1 = end2
1290 1
                        end2 = pos
1291
1292 1
                    if (end1 >= start2) and (start2 >= 0):
1293 1
                        if end2 > end1:
1294 1
                            end1 = end2
1295
1296 1
                        start2 = -1
1297 1
                        end2 = -1
1298
                else:
1299 1
                    pos = phonet_hash[char]
1300 1
                    start1 = pos
1301 1
                    end1 = 10000
1302 1
                    start2 = -1
1303 1
                    end2 = -1
1304
1305 1
                pos = start1
1306 1
                zeta0 = 0
1307
1308 1
                if pos >= 0:
1309
                    # check rules for this char
1310 1
                    while (_phonet_rules[pos] is None) or (
1311
                        _phonet_rules[pos][0] == char
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
1312
                    ):
1313 1
                        if pos > end1:
1314 1
                            if start2 > 0:
1315 1
                                pos = start2
1316 1
                                start1 = start2
1317 1
                                start2 = -1
1318 1
                                end1 = end2
1319 1
                                end2 = -1
1320 1
                                continue
1321
1322 1
                            break
1323
1324 1
                        if (_phonet_rules[pos] is None) or (
1325
                            _phonet_rules[pos + mode] is None
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
1326
                        ):
1327
                            # no conversion rule available
1328 1
                            pos += 3
1329 1
                            continue
1330
1331
                        # check whole string
1332 1
                        matches = 1  # number of matching letters
1333 1
                        priority = 5  # default priority
1334 1
                        rule = _phonet_rules[pos]
1335 1
                        rule = rule[1:]
1336
1337 1
                        while (
1338
                            rule
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
1339
                            and (len(src) > (i + matches))
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
1340
                            and (src[i + matches] == rule[0])
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
1341
                            and not rule[0].isdigit()
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
1342
                            and (rule not in '(-<^$')
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
1343
                        ):
1344 1
                            matches += 1
1345 1
                            rule = rule[1:]
1346
1347 1
                        if rule and (rule[0] == '('):
1348
                            # check an array of letters
1349 1
                            if (
1350
                                (len(src) > (i + matches))
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
1351
                                and src[i + matches].isalpha()
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
1352
                                and (src[i + matches] in rule[1:])
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
1353
                            ):
1354 1
                                matches += 1
1355
1356 1
                                while rule and rule[0] != ')':
1357 1
                                    rule = rule[1:]
1358
1359
                                # if rule[0] == ')':
1360 1
                                rule = rule[1:]
1361
1362 1
                        if rule:
1363 1
                            priority0 = ord(rule[0])
1364
                        else:
1365 1
                            priority0 = 0
1366
1367 1
                        matches0 = matches
1368
1369 1
                        while rule and rule[0] == '-' and matches > 1:
1370 1
                            matches -= 1
1371 1
                            rule = rule[1:]
1372
1373 1
                        if rule and rule[0] == '<':
1374 1
                            rule = rule[1:]
1375
1376 1
                        if rule and rule[0].isdigit():
1377
                            # read priority
1378 1
                            priority = int(rule[0])
1379 1
                            rule = rule[1:]
1380
1381 1
                        if rule and rule[0:2] == '^^':
1382 1
                            rule = rule[1:]
1383
1384 1
                        if (
1385
                            not rule
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
best-practice introduced by
Too many boolean expressions in if statement (12/5)
Loading history...
1386
                            or (
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
1387
                                (rule[0] == '^')
1388
                                and ((i == 0) or not src[i - 1].isalpha())
1389
                                and (
1390
                                    (rule[1:2] != '$')
1391
                                    or (
1392
                                        not (
1393
                                            src[
1394
                                                i + matches0 : i + matches0 + 1
1395
                                            ].isalpha()
1396
                                        )
1397
                                        and (
1398
                                            src[
1399
                                                i + matches0 : i + matches0 + 1
1400
                                            ]
1401
                                            != '.'
1402
                                        )
1403
                                    )
1404
                                )
1405
                            )
1406
                            or (
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
1407
                                (rule[0] == '$')
1408
                                and (i > 0)
1409
                                and src[i - 1].isalpha()
1410
                                and (
1411
                                    (
1412
                                        not src[
1413
                                            i + matches0 : i + matches0 + 1
1414
                                        ].isalpha()
1415
                                    )
1416
                                    and (
1417
                                        src[i + matches0 : i + matches0 + 1]
1418
                                        != '.'
1419
                                    )
1420
                                )
1421
                            )
1422
                        ):
1423
                            # look for continuation, if:
1424
                            # matches > 1 und NO '-' in first string */
1425 1
                            pos0 = -1
1426
1427 1
                            start3 = 0
1428 1
                            start4 = 0
1429 1
                            end3 = 0
1430 1
                            end4 = 0
1431
1432 1
                            if (
1433
                                (matches > 1)
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
1434
                                and src[i + matches : i + matches + 1]
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
1435
                                and (priority0 != ord('-'))
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
1436
                            ):
1437 1
                                char0 = src[i + matches - 1]
1438 1
                                pos0 = alpha_pos[char0]
1439
1440 1
                                if pos0 >= 2 and src[i + matches]:
1441 1
                                    xpos = pos0 - 2
1442 1
                                    pos0 = alpha_pos[src[i + matches]]
1443 1
                                    start3 = phonet_hash_1[xpos, pos0]
1444 1
                                    start4 = phonet_hash_1[xpos, 0]
1445 1
                                    end3 = phonet_hash_2[xpos, pos0]
1446 1
                                    end4 = phonet_hash_2[xpos, 0]
1447
1448
                                    # preserve rule priorities
1449 1
                                    if (start4 >= 0) and (
1450
                                        (start3 < 0) or (start4 < start3)
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
1451
                                    ):
1452 1
                                        pos0 = start3
1453 1
                                        start3 = start4
1454 1
                                        start4 = pos0
1455 1
                                        pos0 = end3
1456 1
                                        end3 = end4
1457 1
                                        end4 = pos0
1458
1459 1
                                    if (end3 >= start4) and (start4 >= 0):
1460 1
                                        if end4 > end3:
1461 1
                                            end3 = end4
1462
1463 1
                                        start4 = -1
1464 1
                                        end4 = -1
1465
                                else:
1466 1
                                    pos0 = phonet_hash[char0]
1467 1
                                    start3 = pos0
1468 1
                                    end3 = 10000
1469 1
                                    start4 = -1
1470 1
                                    end4 = -1
1471
1472 1
                                pos0 = start3
1473
1474
                            # check continuation rules for src[i+matches]
1475 1
                            if pos0 >= 0:
1476 1
                                while (_phonet_rules[pos0] is None) or (
1477
                                    _phonet_rules[pos0][0] == char0
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
introduced by
Value '_phonet_rules[pos0]' is unsubscriptable
Loading history...
1478
                                ):
1479 1
                                    if pos0 > end3:
1480 1
                                        if start4 > 0:
1481 1
                                            pos0 = start4
1482 1
                                            start3 = start4
1483 1
                                            start4 = -1
1484 1
                                            end3 = end4
1485 1
                                            end4 = -1
1486 1
                                            continue
1487
1488 1
                                        priority0 = -1
1489
1490
                                        # important
1491 1
                                        break
1492
1493 1
                                    if (_phonet_rules[pos0] is None) or (
1494
                                        _phonet_rules[pos0 + mode] is None
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
1495
                                    ):
1496
                                        # no conversion rule available
1497 1
                                        pos0 += 3
1498 1
                                        continue
1499
1500
                                    # check whole string
1501 1
                                    matches0 = matches
1502 1
                                    priority0 = 5
1503 1
                                    rule = _phonet_rules[pos0]
1504 1
                                    rule = rule[1:]
0 ignored issues
show
introduced by
Value 'rule' is unsubscriptable
Loading history...
1505
1506 1
                                    while (
1507
                                        rule
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
1508
                                        and (
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
1509
                                            src[
1510
                                                i + matches0 : i + matches0 + 1
1511
                                            ]
1512
                                            == rule[0]
1513
                                        )
1514
                                        and (
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
1515
                                            not rule[0].isdigit()
1516
                                            or (rule in '(-<^$')
1517
                                        )
1518
                                    ):
1519 1
                                        matches0 += 1
1520 1
                                        rule = rule[1:]
1521
1522 1
                                    if rule and rule[0] == '(':
1523
                                        # check an array of letters
1524 1
                                        if src[
1525
                                            i + matches0 : i + matches0 + 1
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
1526
                                        ].isalpha() and (
1527
                                            src[i + matches0] in rule[1:]
1528
                                        ):
1529 1
                                            matches0 += 1
1530
1531 1
                                            while rule and rule[0] != ')':
1532 1
                                                rule = rule[1:]
1533
1534
                                            # if rule[0] == ')':
1535 1
                                            rule = rule[1:]
1536
1537 1
                                    while rule and rule[0] == '-':
1538
                                        # "matches0" is NOT decremented
1539
                                        # because of
1540
                                        #    "if (matches0 == matches)"
1541 1
                                        rule = rule[1:]
1542
1543 1
                                    if rule and rule[0] == '<':
1544 1
                                        rule = rule[1:]
1545
1546 1
                                    if rule and rule[0].isdigit():
1547 1
                                        priority0 = int(rule[0])
1548 1
                                        rule = rule[1:]
1549
1550 1
                                    if (
1551
                                        not rule
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
1552
                                        or
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
1553
                                        # rule == '^' is not possible here
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
1554
                                        (
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
1555
                                            (rule[0] == '$')
1556
                                            and not src[
1557
                                                i + matches0 : i + matches0 + 1
1558
                                            ].isalpha()
1559
                                            and (
1560
                                                src[
1561
                                                    i
1562
                                                    + matches0 : i
1563
                                                    + matches0
1564
                                                    + 1
1565
                                                ]
1566
                                                != '.'
1567
                                            )
1568
                                        )
1569
                                    ):
1570 1
                                        if matches0 == matches:
1571
                                            # this is only a partial string
1572 1
                                            pos0 += 3
1573 1
                                            continue
1574
1575 1
                                        if priority0 < priority:
1576
                                            # priority is too low
1577 1
                                            pos0 += 3
1578 1
                                            continue
1579
1580
                                        # continuation rule found
1581 1
                                        break
1582
1583 1
                                    pos0 += 3
1584
1585
                                # end of "while"
1586 1
                                if (priority0 >= priority) and (
1587
                                    (_phonet_rules[pos0] is not None)
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
1588
                                    and (_phonet_rules[pos0][0] == char0)
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
introduced by
Value '_phonet_rules[pos0]' is unsubscriptable
Loading history...
1589
                                ):
1590
1591 1
                                    pos += 3
1592 1
                                    continue
1593
1594
                            # replace string
1595 1
                            if _phonet_rules[pos] and (
1596
                                '<' in _phonet_rules[pos][1:]
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
1597
                            ):
1598 1
                                priority0 = 1
1599
                            else:
1600 1
                                priority0 = 0
1601
1602 1
                            rule = _phonet_rules[pos + mode]
1603
1604 1
                            if (priority0 == 1) and (zeta == 0):
1605
                                # rule with '<' is applied
1606 1
                                if (
1607
                                    (j > 0)
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
1608
                                    and rule
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
1609
                                    and (
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
1610
                                        (dest[j - 1] == char)
1611
                                        or (dest[j - 1] == rule[0])
1612
                                    )
1613
                                ):
1614 1
                                    j -= 1
1615
1616 1
                                zeta0 = 1
1617 1
                                zeta += 1
1618 1
                                matches0 = 0
1619
1620 1
                                while rule and src[i + matches0]:
1621 1
                                    src = (
1622
                                        src[0 : i + matches0]
1623
                                        + rule[0]
1624
                                        + src[i + matches0 + 1 :]
1625
                                    )
1626 1
                                    matches0 += 1
1627 1
                                    rule = rule[1:]
1628
1629 1
                                if matches0 < matches:
1630 1
                                    src = (
1631
                                        src[0 : i + matches0]
1632
                                        + src[i + matches :]
1633
                                    )
1634
1635 1
                                char = src[i]
1636
                            else:
1637 1
                                i = i + matches - 1
1638 1
                                zeta = 0
1639
1640 1
                                while len(rule) > 1:
1641 1
                                    if (j == 0) or (dest[j - 1] != rule[0]):
1642 1
                                        dest = (
1643
                                            dest[0:j]
1644
                                            + rule[0]
1645
                                            + dest[min(len(dest), j + 1) :]
1646
                                        )
1647 1
                                        j += 1
1648
1649 1
                                    rule = rule[1:]
1650
1651
                                # new "current char"
1652 1
                                if not rule:
1653 1
                                    rule = ''
1654 1
                                    char = ''
1655
                                else:
1656 1
                                    char = rule[0]
1657
1658 1
                                if (
1659
                                    _phonet_rules[pos]
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
1660
                                    and '^^' in _phonet_rules[pos][1:]
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
1661
                                ):
1662 1
                                    if char:
1663 1
                                        dest = (
1664
                                            dest[0:j]
1665
                                            + char
1666
                                            + dest[min(len(dest), j + 1) :]
1667
                                        )
1668 1
                                        j += 1
1669
1670 1
                                    src = src[i + 1 :]
1671 1
                                    i = 0
1672 1
                                    zeta0 = 1
1673
1674 1
                            break
1675
1676 1
                        pos += 3
1677
1678 1
                        if pos > end1 and start2 > 0:
1679 1
                            pos = start2
1680 1
                            start1 = start2
1681 1
                            end1 = end2
1682 1
                            start2 = -1
1683 1
                            end2 = -1
1684
1685 1
                if zeta0 == 0:
1686 1
                    if char and ((j == 0) or (dest[j - 1] != char)):
1687
                        # delete multiple letters only
1688 1
                        dest = (
1689
                            dest[0:j] + char + dest[min(j + 1, term_length) :]
1690
                        )
1691 1
                        j += 1
1692
1693 1
                    i += 1
1694 1
                    zeta = 0
1695
1696 1
            dest = dest[0:j]
1697
1698 1
            return dest
1699
1700 1
        _initialize_phonet(lang)
1701
1702 1
        word = unicode_normalize('NFKC', text_type(word))
1703 1
        return _phonet(word, mode, lang)
1704
1705
1706 1
def phonet(word, mode=1, lang='de'):
    """Encode a word with the phonet ("Hannoveraner Phonetik") algorithm.

    Convenience function that builds a :py:class:`Phonet` instance and
    delegates to :py:meth:`Phonet.encode`.

    Args:
        word (str): The word to transform
        mode (int): The phonet variant to employ (1 or 2)
        lang (str): 'de' (default) for German, 'none' for no language

    Returns:
        str: The phonet value

    Examples:
        >>> phonet('Christopher')
        'KRISTOFA'
        >>> phonet('Niall')
        'NIAL'
        >>> phonet('Smith')
        'SMIT'
        >>> phonet('Schmidt')
        'SHMIT'

        >>> phonet('Christopher', mode=2)
        'KRIZTUFA'
        >>> phonet('Niall', mode=2)
        'NIAL'
        >>> phonet('Smith', mode=2)
        'ZNIT'
        >>> phonet('Schmidt', mode=2)
        'ZNIT'

        >>> phonet('Christopher', lang='none')
        'CHRISTOPHER'
        >>> phonet('Niall', lang='none')
        'NIAL'
        >>> phonet('Smith', lang='none')
        'SMITH'
        >>> phonet('Schmidt', lang='none')
        'SCHMIDT'

    """
    # A fresh encoder is created per call; arguments are forwarded
    # positionally, exactly as received.
    encoder = Phonet()
    return encoder.encode(word, mode, lang)
1749
1750
1751
if __name__ == '__main__':
    # When run as a script, execute the doctest examples embedded in this
    # module's docstrings.
    from doctest import testmod

    testmod()
1755