abydos.phonetic._phonet.Phonet.__init__()   A
last analyzed

Complexity

Conditions 1

Size

Total Lines 16
Code Lines 3

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 2
CRAP Score 1

Importance

Changes 0
Metric Value
eloc 3
dl 0
loc 16
ccs 2
cts 2
cp 1
rs 10
c 0
b 0
f 0
cc 1
nop 3
crap 1
1
# Copyright 2014-2020 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.
16
17
"""abydos.phonetic._phonet.

phonet algorithm (a.k.a. Hannoveraner Phonetik), intended chiefly for German
"""
21
22
from collections import Counter
23
from typing import Counter as TCounter, Optional, Tuple, Union, cast
24 1
from unicodedata import normalize as unicode_normalize
25
26
from ._phonetic import _Phonetic
27
28
__all__ = ['Phonet']
29
30
31 1
class Phonet(_Phonetic):
32 1
    """Phonet code.
33
34 1
    phonet ("Hannoveraner Phonetik") was developed by Jörg Michael and
35
    documented in :cite:`Michael:1999`.
36 1
37 1
    This is a port of Jesper Zedlitz's code, which is licensed LGPL
38
    :cite:`Zedlitz:2015`.
39 1
40 1
    That is, in turn, based on Michael's C code, which is also licensed LGPL
41
    :cite:`Michael:2007`.
42 1
43
    .. versionadded:: 0.3.6
44
    """
45 1
46
    _rules_no_lang = (  # separator chars
47
        # fmt: off
48
        '´', ' ', ' ',
49
        '"', ' ', ' ',
50
        '`$', '', '',
51
        "'", ' ', ' ',
52
        ',', ',', ',',
53
        ';', ',', ',',
54
        '-', ' ', ' ',
55
        ' ', ' ', ' ',
56
        '.', '.', '.',
57
        ':', '.', '.',
58
        # German umlauts
59
        'Ä', 'AE', 'AE',
60 1
        'Ö', 'OE', 'OE',
61
        'Ü', 'UE', 'UE',
62
        'ß', 'S', 'S',
63
        # international umlauts
64
        'À', 'A', 'A',
65
        'Á', 'A', 'A',
66
        'Â', 'A', 'A',
67
        'Ã', 'A', 'A',
68
        'Å', 'A', 'A',
69
        'Æ', 'AE', 'AE',
70
        'Ç', 'C', 'C',
71
        'Ð', 'DJ', 'DJ',
72
        'È', 'E', 'E',
73
        'É', 'E', 'E',
74
        'Ê', 'E', 'E',
75
        'Ë', 'E', 'E',
76
        'Ì', 'I', 'I',
77
        'Í', 'I', 'I',
78
        'Î', 'I', 'I',
79
        'Ï', 'I', 'I',
80
        'Ñ', 'NH', 'NH',
81
        'Ò', 'O', 'O',
82
        'Ó', 'O', 'O',
83
        'Ô', 'O', 'O',
84
        'Õ', 'O', 'O',
85
        'Œ', 'OE', 'OE',
86
        'Ø', 'OE', 'OE',
87
        'Š', 'SH', 'SH',
88
        'Þ', 'TH', 'TH',
89
        'Ù', 'U', 'U',
90
        'Ú', 'U', 'U',
91
        'Û', 'U', 'U',
92
        'Ý', 'Y', 'Y',
93
        'Ÿ', 'Y', 'Y',
94
        # 'normal' letters (A-Z)
95
        'MC^', 'MAC', 'MAC',
96
        'MC^', 'MAC', 'MAC',
97
        'M´^', 'MAC', 'MAC',
98
        "M'^", 'MAC', 'MAC',
99
        'O´^', 'O', 'O',
100
        "O'^", 'O', 'O',
101
        'VAN DEN ^', 'VANDEN', 'VANDEN',
102
        None, None, None
103
        # fmt: on
104
    )  # type: Tuple[Union[str, None], ...]
105
106
    _rules_german = (  # separator chars
107
        # fmt: off
108
        '´', ' ', ' ',
109
        '"', ' ', ' ',
110
        '`$', '', '',
111
        "'", ' ', ' ',
112
        ',', ' ', ' ',
113
        ';', ' ', ' ',
114
        '-', ' ', ' ',
115
        ' ', ' ', ' ',
116
        '.', '.', '.',
117
        ':', '.', '.',
118
        # German umlauts
119
        'ÄE', 'E', 'E',
120 1
        'ÄU<', 'EU', 'EU',
121
        'ÄV(AEOU)-<', 'EW', None,
122
        'Ä$', 'Ä', None,
123
        'Ä<', None, 'E',
124
        'Ä', 'E', None,
125
        'ÖE', 'Ö', 'Ö',
126
        'ÖU', 'Ö', 'Ö',
127
        'ÖVER--<', 'ÖW', None,
128
        'ÖV(AOU)-', 'ÖW', None,
129
        'ÜBEL(GNRW)-^^', 'ÜBL ', 'IBL ',
130
        'ÜBER^^', 'ÜBA', 'IBA',
131
        'ÜE', 'Ü', 'I',
132
        'ÜVER--<', 'ÜW', None,
133
        'ÜV(AOU)-', 'ÜW', None,
134
        'Ü', None, 'I',
135
        'ßCH<', None, 'Z',
136
        'ß<', 'S', 'Z',
137
        # international umlauts
138
        'À<', 'A', 'A',
139
        'Á<', 'A', 'A',
140
        'Â<', 'A', 'A',
141
        'Ã<', 'A', 'A',
142
        'Å<', 'A', 'A',
143
        'ÆER-', 'E', 'E',
144
        'ÆU<', 'EU', 'EU',
145
        'ÆV(AEOU)-<', 'EW', None,
146
        'Æ$', 'Ä', None,
147
        'Æ<', None, 'E',
148
        'Æ', 'E', None,
149
        'Ç', 'Z', 'Z',
150
        'ÐÐ-', '', '',
151
        'Ð', 'DI', 'TI',
152
        'È<', 'E', 'E',
153
        'É<', 'E', 'E',
154
        'Ê<', 'E', 'E',
155
        'Ë', 'E', 'E',
156
        'Ì<', 'I', 'I',
157
        'Í<', 'I', 'I',
158
        'Î<', 'I', 'I',
159
        'Ï', 'I', 'I',
160
        'ÑÑ-', '', '',
161
        'Ñ', 'NI', 'NI',
162
        'Ò<', 'O', 'U',
163
        'Ó<', 'O', 'U',
164
        'Ô<', 'O', 'U',
165
        'Õ<', 'O', 'U',
166
        'Œ<', 'Ö', 'Ö',
167
        'Ø(IJY)-<', 'E', 'E',
168
        'Ø<', 'Ö', 'Ö',
169
        'Š', 'SH', 'Z',
170
        'Þ', 'T', 'T',
171
        'Ù<', 'U', 'U',
172
        'Ú<', 'U', 'U',
173
        'Û<', 'U', 'U',
174
        'Ý<', 'I', 'I',
175
        'Ÿ<', 'I', 'I',
176
        # 'normal' letters (A-Z)
177
        'ABELLE$', 'ABL', 'ABL',
178
        'ABELL$', 'ABL', 'ABL',
179
        'ABIENNE$', 'ABIN', 'ABIN',
180
        'ACHME---^', 'ACH', 'AK',
181
        'ACEY$', 'AZI', 'AZI',
182
        'ADV', 'ATW', None,
183
        'AEGL-', 'EK', None,
184
        'AEU<', 'EU', 'EU',
185
        'AE2', 'E', 'E',
186
        'AFTRAUBEN------', 'AFT ', 'AFT ',
187
        'AGL-1', 'AK', None,
188
        'AGNI-^', 'AKN', 'AKN',
189
        'AGNIE-', 'ANI', 'ANI',
190
        'AGN(AEOU)-$', 'ANI', 'ANI',
191
        'AH(AIOÖUÜY)-', 'AH', None,
192
        'AIA2', 'AIA', 'AIA',
193
        'AIE$', 'E', 'E',
194
        'AILL(EOU)-', 'ALI', 'ALI',
195
        'AINE$', 'EN', 'EN',
196
        'AIRE$', 'ER', 'ER',
197
        'AIR-', 'E', 'E',
198
        'AISE$', 'ES', 'EZ',
199
        'AISSANCE$', 'ESANS', 'EZANZ',
200
        'AISSE$', 'ES', 'EZ',
201
        'AIX$', 'EX', 'EX',
202
        'AJ(AÄEÈÉÊIOÖUÜ)--', 'A', 'A',
203
        'AKTIE', 'AXIE', 'AXIE',
204
        'AKTUEL', 'AKTUEL', None,
205
        'ALOI^', 'ALOI', 'ALUI',  # Don't merge these rules
206
        'ALOY^', 'ALOI', 'ALUI',  # needed by 'check_rules'
207
        'AMATEU(RS)-', 'AMATÖ', 'ANATÖ',
208
        'ANCH(OEI)-', 'ANSH', 'ANZ',
209
        'ANDERGEGANG----', 'ANDA GE', 'ANTA KE',
210
        'ANDERGEHE----', 'ANDA ', 'ANTA ',
211
        'ANDERGESETZ----', 'ANDA GE', 'ANTA KE',
212
        'ANDERGING----', 'ANDA ', 'ANTA ',
213
        'ANDERSETZ(ET)-----', 'ANDA ', 'ANTA ',
214
        'ANDERZUGEHE----', 'ANDA ZU ', 'ANTA ZU ',
215
        'ANDERZUSETZE-----', 'ANDA ZU ', 'ANTA ZU ',
216
        'ANER(BKO)---^^', 'AN', None,
217
        'ANHAND---^$', 'AN H', 'AN ',
218
        'ANH(AÄEIOÖUÜY)--^^', 'AN', None,
219
        'ANIELLE$', 'ANIEL', 'ANIL',
220
        'ANIEL', 'ANIEL', None,
221
        'ANSTELLE----^$', 'AN ST', 'AN ZT',
222
        'ANTI^^', 'ANTI', 'ANTI',
223
        'ANVER^^', 'ANFA', 'ANFA',
224
        'ATIA$', 'ATIA', 'ATIA',
225
        'ATIA(NS)--', 'ATI', 'ATI',
226
        'ATI(AÄOÖUÜ)-', 'AZI', 'AZI',
227
        'AUAU--', '', '',
228
        'AUERE$', 'AUERE', None,
229
        'AUERE(NS)-$', 'AUERE', None,
230
        'AUERE(AIOUY)--', 'AUER', None,
231
        'AUER(AÄIOÖUÜY)-', 'AUER', None,
232
        'AUER<', 'AUA', 'AUA',
233
        'AUF^^', 'AUF', 'AUF',
234
        'AULT$', 'O', 'U',
235
        'AUR(BCDFGKLMNQSTVWZ)-', 'AUA', 'AUA',
236
        'AUR$', 'AUA', 'AUA',
237
        'AUSSE$', 'OS', 'UZ',
238
        'AUS(ST)-^', 'AUS', 'AUS',
239
        'AUS^^', 'AUS', 'AUS',
240
        'AUTOFAHR----', 'AUTO ', 'AUTU ',
241
        'AUTO^^', 'AUTO', 'AUTU',
242
        'AUX(IY)-', 'AUX', 'AUX',
243
        'AUX', 'O', 'U',
244
        'AU', 'AU', 'AU',
245
        'AVER--<', 'AW', None,
246
        'AVIER$', 'AWIE', 'AFIE',
247
        'AV(EÈÉÊI)-^', 'AW', None,
248
        'AV(AOU)-', 'AW', None,
249
        'AYRE$', 'EIRE', 'EIRE',
250
        'AYRE(NS)-$', 'EIRE', 'EIRE',
251
        'AYRE(AIOUY)--', 'EIR', 'EIR',
252
        'AYR(AÄIOÖUÜY)-', 'EIR', 'EIR',
253
        'AYR<', 'EIA', 'EIA',
254
        'AYER--<', 'EI', 'EI',
255
        'AY(AÄEIOÖUÜY)--', 'A', 'A',
256
        'AË', 'E', 'E',
257
        'A(IJY)<', 'EI', 'EI',
258
        'BABY^$', 'BEBI', 'BEBI',
259
        'BAB(IY)^', 'BEBI', 'BEBI',
260
        'BEAU^$', 'BO', None,
261
        'BEA(BCMNRU)-^', 'BEA', 'BEA',
262
        'BEAT(AEIMORU)-^', 'BEAT', 'BEAT',
263
        'BEE$', 'BI', 'BI',
264
        'BEIGE^$', 'BESH', 'BEZ',
265
        'BENOIT--', 'BENO', 'BENU',
266
        'BER(DT)-', 'BER', None,
267
        'BERN(DT)-', 'BERN', None,
268
        'BE(LMNRST)-^', 'BE', 'BE',
269
        'BETTE$', 'BET', 'BET',
270
        'BEVOR^$', 'BEFOR', None,
271
        'BIC$', 'BIZ', 'BIZ',
272
        'BOWL(EI)-', 'BOL', 'BUL',
273
        'BP(AÄEÈÉÊIÌÍÎOÖRUÜY)-', 'B', 'B',
274
        'BRINGEND-----^', 'BRI', 'BRI',
275
        'BRINGEND-----', ' BRI', ' BRI',
276
        'BROW(NS)-', 'BRAU', 'BRAU',
277
        'BUDGET7', 'BÜGE', 'BIKE',
278
        'BUFFET7', 'BÜFE', 'BIFE',
279
        'BYLLE$', 'BILE', 'BILE',
280
        'BYLL$', 'BIL', 'BIL',
281
        'BYPA--^', 'BEI', 'BEI',
282
        'BYTE<', 'BEIT', 'BEIT',
283
        'BY9^', 'BÜ', None,
284
        'B(SßZ)$', 'BS', None,
285
        'CACH(EI)-^', 'KESH', 'KEZ',
286
        'CAE--', 'Z', 'Z',
287
        'CA(IY)$', 'ZEI', 'ZEI',
288
        'CE(EIJUY)--', 'Z', 'Z',
289
        'CENT<', 'ZENT', 'ZENT',
290
        'CERST(EI)----^', 'KE', 'KE',
291
        'CER$', 'ZA', 'ZA',
292
        'CE3', 'ZE', 'ZE',
293
        "CH'S$", 'X', 'X',
294
        'CH´S$', 'X', 'X',
295
        'CHAO(ST)-', 'KAO', 'KAU',
296
        'CHAMPIO-^', 'SHEMPI', 'ZENBI',
297
        'CHAR(AI)-^', 'KAR', 'KAR',
298
        'CHAU(CDFSVWXZ)-', 'SHO', 'ZU',
299
        'CHÄ(CF)-', 'SHE', 'ZE',
300
        'CHE(CF)-', 'SHE', 'ZE',
301
        'CHEM-^', 'KE', 'KE',  # or: 'CHE', 'KE'
302
        'CHEQUE<', 'SHEK', 'ZEK',
303
        'CHI(CFGPVW)-', 'SHI', 'ZI',
304
        'CH(AEUY)-<^', 'SH', 'Z',
305
        'CHK-', '', '',
306
        'CHO(CKPS)-^', 'SHO', 'ZU',
307
        'CHRIS-', 'KRI', None,
308
        'CHRO-', 'KR', None,
309
        'CH(LOR)-<^', 'K', 'K',
310
        'CHST-', 'X', 'X',
311
        'CH(SßXZ)3', 'X', 'X',
312
        'CHTNI-3', 'CHN', 'KN',
313
        'CH^', 'K', 'K',  # or: 'CH', 'K'
314
        'CH', 'CH', 'K',
315
        'CIC$', 'ZIZ', 'ZIZ',
316
        'CIENCEFICT----', 'EIENS ', 'EIENZ ',
317
        'CIENCE$', 'EIENS', 'EIENZ',
318
        'CIER$', 'ZIE', 'ZIE',
319
        'CYB-^', 'ZEI', 'ZEI',
320
        'CY9^', 'ZÜ', 'ZI',
321
        'C(IJY)-<3', 'Z', 'Z',
322
        'CLOWN-', 'KLAU', 'KLAU',
323
        'CCH', 'Z', 'Z',
324
        'CCE-', 'X', 'X',
325
        'C(CK)-', '', '',
326
        'CLAUDET---', 'KLO', 'KLU',
327
        'CLAUDINE^$', 'KLODIN', 'KLUTIN',
328
        'COACH', 'KOSH', 'KUZ',
329
        'COLE$', 'KOL', 'KUL',
330
        'COUCH', 'KAUSH', 'KAUZ',
331
        'COW', 'KAU', 'KAU',
332
        'CQUES$', 'K', 'K',
333
        'CQUE', 'K', 'K',
334
        'CRASH--9', 'KRE', 'KRE',
335
        'CREAT-^', 'KREA', 'KREA',
336
        'CST', 'XT', 'XT',
337
        'CS<^', 'Z', 'Z',
338
        'C(SßX)', 'X', 'X',
339
        "CT'S$", 'X', 'X',
340
        'CT(SßXZ)', 'X', 'X',
341
        'CZ<', 'Z', 'Z',
342
        'C(ÈÉÊÌÍÎÝ)3', 'Z', 'Z',
343
        'C.^', 'C.', 'C.',
344
        'CÄ-', 'Z', 'Z',
345
        'CÜ$', 'ZÜ', 'ZI',
346
        "C'S$", 'X', 'X',
347
        'C<', 'K', 'K',
348
        'DAHER^$', 'DAHER', None,
349
        'DARAUFFOLGE-----', 'DARAUF ', 'TARAUF ',
350
        'DAVO(NR)-^$', 'DAFO', 'TAFU',
351
        'DD(SZ)--<', '', '',
352
        'DD9', 'D', None,
353
        'DEPOT7', 'DEPO', 'TEBU',
354
        'DESIGN', 'DISEIN', 'TIZEIN',
355
        'DE(LMNRST)-3^', 'DE', 'TE',
356
        'DETTE$', 'DET', 'TET',
357
        'DH$', 'T', None,
358
        'DIC$', 'DIZ', 'TIZ',
359
        'DIDR-^', 'DIT', None,
360
        'DIEDR-^', 'DIT', None,
361
        'DJ(AEIOU)-^', 'I', 'I',
362
        'DMITR-^', 'DIMIT', 'TINIT',
363
        'DRY9^', 'DRÜ', None,
364
        'DT-', '', '',
365
        'DUIS-^', 'DÜ', 'TI',
366
        'DURCH^^', 'DURCH', 'TURK',
367
        'DVA$', 'TWA', None,
368
        'DY9^', 'DÜ', None,
369
        'DYS$', 'DIS', None,
370
        'DS(CH)--<', 'T', 'T',
371
        'DST', 'ZT', 'ZT',
372
        'DZS(CH)--', 'T', 'T',
373
        'D(SßZ)', 'Z', 'Z',
374
        'D(AÄEIOÖRUÜY)-', 'D', None,
375
        'D(ÀÁÂÃÅÈÉÊÌÍÎÙÚÛ)-', 'D', None,
376
        "D'H^", 'D', 'T',
377
        'D´H^', 'D', 'T',
378
        'D`H^', 'D', 'T',
379
        "D'S3$", 'Z', 'Z',
380
        'D´S3$', 'Z', 'Z',
381
        'D^', 'D', None,
382
        'D', 'T', 'T',
383
        'EAULT$', 'O', 'U',
384
        'EAUX$', 'O', 'U',
385
        'EAU', 'O', 'U',
386
        'EAV', 'IW', 'IF',
387
        'EAS3$', 'EAS', None,
388
        'EA(AÄEIOÖÜY)-3', 'EA', 'EA',
389
        'EA3$', 'EA', 'EA',
390
        'EA3', 'I', 'I',
391
        'EBENSO^$', 'EBNSO', 'EBNZU',
392
        'EBENSO^^', 'EBNSO ', 'EBNZU ',
393
        'EBEN^^', 'EBN', 'EBN',
394
        'EE9', 'E', 'E',
395
        'EGL-1', 'EK', None,
396
        'EHE(IUY)--1', 'EH', None,
397
        'EHUNG---1', 'E', None,
398
        'EH(AÄIOÖUÜY)-1', 'EH', None,
399
        'EIEI--', '', '',
400
        'EIERE^$', 'EIERE', None,
401
        'EIERE$', 'EIERE', None,
402
        'EIERE(NS)-$', 'EIERE', None,
403
        'EIERE(AIOUY)--', 'EIER', None,
404
        'EIER(AÄIOÖUÜY)-', 'EIER', None,
405
        'EIER<', 'EIA', None,
406
        'EIGL-1', 'EIK', None,
407
        'EIGH$', 'EI', 'EI',
408
        'EIH--', 'E', 'E',
409
        'EILLE$', 'EI', 'EI',
410
        'EIR(BCDFGKLMNQSTVWZ)-', 'EIA', 'EIA',
411
        'EIR$', 'EIA', 'EIA',
412
        'EITRAUBEN------', 'EIT ', 'EIT ',
413
        'EI', 'EI', 'EI',
414
        'EJ$', 'EI', 'EI',
415
        'ELIZ^', 'ELIS', None,
416
        'ELZ^', 'ELS', None,
417
        'EL-^', 'E', 'E',
418
        'ELANG----1', 'E', 'E',
419
        'EL(DKL)--1', 'E', 'E',
420
        'EL(MNT)--1$', 'E', 'E',
421
        'ELYNE$', 'ELINE', 'ELINE',
422
        'ELYN$', 'ELIN', 'ELIN',
423
        'EL(AÄEÈÉÊIÌÍÎOÖUÜY)-1', 'EL', 'EL',
424
        'EL-1', 'L', 'L',
425
        'EM-^', None, 'E',
426
        'EM(DFKMPQT)--1', None, 'E',
427
        'EM(AÄEÈÉÊIÌÍÎOÖUÜY)--1', None, 'E',
428
        'EM-1', None, 'N',
429
        'ENGAG-^', 'ANGA', 'ANKA',
430
        'EN-^', 'E', 'E',
431
        'ENTUEL', 'ENTUEL', None,
432
        'EN(CDGKQSTZ)--1', 'E', 'E',
433
        'EN(AÄEÈÉÊIÌÍÎNOÖUÜY)-1', 'EN', 'EN',
434
        'EN-1', '', '',
435
        'ERH(AÄEIOÖUÜ)-^', 'ERH', 'ER',
436
        'ER-^', 'E', 'E',
437
        'ERREGEND-----', ' ER', ' ER',
438
        'ERT1$', 'AT', None,
439
        'ER(DGLKMNRQTZß)-1', 'ER', None,
440
        'ER(AÄEÈÉÊIÌÍÎOÖUÜY)-1', 'ER', 'A',
441
        'ER1$', 'A', 'A',
442
        'ER<1', 'A', 'A',
443
        'ETAT7', 'ETA', 'ETA',
444
        'ETI(AÄOÖÜU)-', 'EZI', 'EZI',
445
        'EUERE$', 'EUERE', None,
446
        'EUERE(NS)-$', 'EUERE', None,
447
        'EUERE(AIOUY)--', 'EUER', None,
448
        'EUER(AÄIOÖUÜY)-', 'EUER', None,
449
        'EUER<', 'EUA', None,
450
        'EUEU--', '', '',
451
        'EUILLE$', 'Ö', 'Ö',
452
        'EUR$', 'ÖR', 'ÖR',
453
        'EUX', 'Ö', 'Ö',
454
        'EUSZ$', 'EUS', None,
455
        'EUTZ$', 'EUS', None,
456
        'EUYS$', 'EUS', 'EUZ',
457
        'EUZ$', 'EUS', None,
458
        'EU', 'EU', 'EU',
459
        'EVER--<1', 'EW', None,
460
        'EV(ÄOÖUÜ)-1', 'EW', None,
461
        'EYER<', 'EIA', 'EIA',
462
        'EY<', 'EI', 'EI',
463
        'FACETTE', 'FASET', 'FAZET',
464
        'FANS--^$', 'FE', 'FE',
465
        'FAN-^$', 'FE', 'FE',
466
        'FAULT-', 'FOL', 'FUL',
467
        'FEE(DL)-', 'FI', 'FI',
468
        'FEHLER', 'FELA', 'FELA',
469
        'FE(LMNRST)-3^', 'FE', 'FE',
470
        'FOERDERN---^', 'FÖRD', 'FÖRT',
471
        'FOERDERN---', ' FÖRD', ' FÖRT',
472
        'FOND7', 'FON', 'FUN',
473
        'FRAIN$', 'FRA', 'FRA',
474
        'FRISEU(RS)-', 'FRISÖ', 'FRIZÖ',
475
        'FY9^', 'FÜ', None,
476
        'FÖRDERN---^', 'FÖRD', 'FÖRT',
477
        'FÖRDERN---', ' FÖRD', ' FÖRT',
478
        'GAGS^$', 'GEX', 'KEX',
479
        'GAG^$', 'GEK', 'KEK',
480
        'GD', 'KT', 'KT',
481
        'GEGEN^^', 'GEGN', 'KEKN',
482
        'GEGENGEKOM-----', 'GEGN ', 'KEKN ',
483
        'GEGENGESET-----', 'GEGN ', 'KEKN ',
484
        'GEGENKOMME-----', 'GEGN ', 'KEKN ',
485
        'GEGENZUKOM---', 'GEGN ZU ', 'KEKN ZU ',
486
        'GENDETWAS-----$', 'GENT ', 'KENT ',
487
        'GENRE', 'IORE', 'IURE',
488
        'GE(LMNRST)-3^', 'GE', 'KE',
489
        'GER(DKT)-', 'GER', None,
490
        'GETTE$', 'GET', 'KET',
491
        'GGF.', 'GF.', None,
492
        'GG-', '', '',
493
        'GH', 'G', None,
494
        'GI(AOU)-^', 'I', 'I',
495
        'GION-3', 'KIO', 'KIU',
496
        'G(CK)-', '', '',
497
        'GJ(AEIOU)-^', 'I', 'I',
498
        'GMBH^$', 'GMBH', 'GMBH',
499
        'GNAC$', 'NIAK', 'NIAK',
500
        'GNON$', 'NION', 'NIUN',
501
        'GN$', 'N', 'N',
502
        'GONCAL-^', 'GONZA', 'KUNZA',
503
        'GRY9^', 'GRÜ', None,
504
        'G(SßXZ)-<', 'K', 'K',
505
        'GUCK-', 'KU', 'KU',
506
        'GUISEP-^', 'IUSE', 'IUZE',
507
        'GUI-^', 'G', 'K',
508
        'GUTAUSSEH------^', 'GUT ', 'KUT ',
509
        'GUTGEHEND------^', 'GUT ', 'KUT ',
510
        'GY9^', 'GÜ', None,
511
        'G(AÄEILOÖRUÜY)-', 'G', None,
512
        'G(ÀÁÂÃÅÈÉÊÌÍÎÙÚÛ)-', 'G', None,
513
        "G'S$", 'X', 'X',
514
        'G´S$', 'X', 'X',
515
        'G^', 'G', None,
516
        'G', 'K', 'K',
517
        'HA(HIUY)--1', 'H', None,
518
        'HANDVOL---^', 'HANT ', 'ANT ',
519
        'HANNOVE-^', 'HANOF', None,
520
        'HAVEN7$', 'HAFN', None,
521
        'HEAD-', 'HE', 'E',
522
        'HELIEGEN------', 'E ', 'E ',
523
        'HESTEHEN------', 'E ', 'E ',
524
        'HE(LMNRST)-3^', 'HE', 'E',
525
        'HE(LMN)-1', 'E', 'E',
526
        'HEUR1$', 'ÖR', 'ÖR',
527
        'HE(HIUY)--1', 'H', None,
528
        'HIH(AÄEIOÖUÜY)-1', 'IH', None,
529
        'HLH(AÄEIOÖUÜY)-1', 'LH', None,
530
        'HMH(AÄEIOÖUÜY)-1', 'MH', None,
531
        'HNH(AÄEIOÖUÜY)-1', 'NH', None,
532
        'HOBBY9^', 'HOBI', None,
533
        'HOCHBEGAB-----^', 'HOCH ', 'UK ',
534
        'HOCHTALEN-----^', 'HOCH ', 'UK ',
535
        'HOCHZUFRI-----^', 'HOCH ', 'UK ',
536
        'HO(HIY)--1', 'H', None,
537
        'HRH(AÄEIOÖUÜY)-1', 'RH', None,
538
        'HUH(AÄEIOÖUÜY)-1', 'UH', None,
539
        'HUIS^^', 'HÜS', 'IZ',
540
        'HUIS$', 'ÜS', 'IZ',
541
        'HUI--1', 'H', None,
542
        'HYGIEN^', 'HÜKIEN', None,
543
        'HY9^', 'HÜ', None,
544
        'HY(BDGMNPST)-', 'Ü', None,
545
        'H.^', None, 'H.',
546
        'HÄU--1', 'H', None,
547
        'H^', 'H', '',
548
        'H', '', '',
549
        'ICHELL---', 'ISH', 'IZ',
550
        'ICHI$', 'ISHI', 'IZI',
551
        'IEC$', 'IZ', 'IZ',
552
        'IEDENSTELLE------', 'IDN ', 'ITN ',
553
        'IEI-3', '', '',
554
        'IELL3', 'IEL', 'IEL',
555
        'IENNE$', 'IN', 'IN',
556
        'IERRE$', 'IER', 'IER',
557
        'IERZULAN---', 'IR ZU ', 'IR ZU ',
558
        'IETTE$', 'IT', 'IT',
559
        'IEU', 'IÖ', 'IÖ',
560
        'IE<4', 'I', 'I',
561
        'IGL-1', 'IK', None,
562
        'IGHT3$', 'EIT', 'EIT',
563
        'IGNI(EO)-', 'INI', 'INI',
564
        'IGN(AEOU)-$', 'INI', 'INI',
565
        'IHER(DGLKRT)--1', 'IHE', None,
566
        'IHE(IUY)--', 'IH', None,
567
        'IH(AIOÖUÜY)-', 'IH', None,
568
        'IJ(AOU)-', 'I', 'I',
569
        'IJ$', 'I', 'I',
570
        'IJ<', 'EI', 'EI',
571
        'IKOLE$', 'IKOL', 'IKUL',
572
        'ILLAN(STZ)--4', 'ILIA', 'ILIA',
573
        'ILLAR(DT)--4', 'ILIA', 'ILIA',
574
        'IMSTAN----^', 'IM ', 'IN ',
575
        'INDELERREGE------', 'INDL ', 'INTL ',
576
        'INFRAGE-----^$', 'IN ', 'IN ',
577
        'INTERN(AOU)-^', 'INTAN', 'INTAN',
578
        'INVER-', 'INWE', 'INFE',
579
        'ITI(AÄIOÖUÜ)-', 'IZI', 'IZI',
580
        'IUSZ$', 'IUS', None,
581
        'IUTZ$', 'IUS', None,
582
        'IUZ$', 'IUS', None,
583
        'IVER--<', 'IW', None,
584
        'IVIER$', 'IWIE', 'IFIE',
585
        'IV(ÄOÖUÜ)-', 'IW', None,
586
        'IV<3', 'IW', None,
587
        'IY2', 'I', None,
588
        'I(ÈÉÊ)<4', 'I', 'I',
589
        'JAVIE---<^', 'ZA', 'ZA',
590
        'JEANS^$', 'JINS', 'INZ',
591
        'JEANNE^$', 'IAN', 'IAN',
592
        'JEAN-^', 'IA', 'IA',
593
        'JER-^', 'IE', 'IE',
594
        'JE(LMNST)-', 'IE', 'IE',
595
        'JI^', 'JI', None,
596
        'JOR(GK)^$', 'IÖRK', 'IÖRK',
597
        'J', 'I', 'I',
598
        'KC(ÄEIJ)-', 'X', 'X',
599
        'KD', 'KT', None,
600
        'KE(LMNRST)-3^', 'KE', 'KE',
601
        'KG(AÄEILOÖRUÜY)-', 'K', None,
602
        'KH<^', 'K', 'K',
603
        'KIC$', 'KIZ', 'KIZ',
604
        'KLE(LMNRST)-3^', 'KLE', 'KLE',
605
        'KOTELE-^', 'KOTL', 'KUTL',
606
        'KREAT-^', 'KREA', 'KREA',
607
        'KRÜS(TZ)--^', 'KRI', None,
608
        'KRYS(TZ)--^', 'KRI', None,
609
        'KRY9^', 'KRÜ', None,
610
        'KSCH---', 'K', 'K',
611
        'KSH--', 'K', 'K',
612
        'K(SßXZ)7', 'X', 'X',  # implies 'KST' -> 'XT'
613
        "KT'S$", 'X', 'X',
614
        'KTI(AIOU)-3', 'XI', 'XI',
615
        'KT(SßXZ)', 'X', 'X',
616
        'KY9^', 'KÜ', None,
617
        "K'S$", 'X', 'X',
618
        'K´S$', 'X', 'X',
619
        'LANGES$', ' LANGES', ' LANKEZ',
620
        'LANGE$', ' LANGE', ' LANKE',
621
        'LANG$', ' LANK', ' LANK',
622
        'LARVE-', 'LARF', 'LARF',
623
        'LD(SßZ)$', 'LS', 'LZ',
624
        "LD'S$", 'LS', 'LZ',
625
        'LD´S$', 'LS', 'LZ',
626
        'LEAND-^', 'LEAN', 'LEAN',
627
        'LEERSTEHE-----^', 'LER ', 'LER ',
628
        'LEICHBLEIB-----', 'LEICH ', 'LEIK ',
629
        'LEICHLAUTE-----', 'LEICH ', 'LEIK ',
630
        'LEIDERREGE------', 'LEIT ', 'LEIT ',
631
        'LEIDGEPR----^', 'LEIT ', 'LEIT ',
632
        'LEINSTEHE-----', 'LEIN ', 'LEIN ',
633
        'LEL-', 'LE', 'LE',
634
        'LE(MNRST)-3^', 'LE', 'LE',
635
        'LETTE$', 'LET', 'LET',
636
        'LFGNAG-', 'LFGAN', 'LFKAN',
637
        'LICHERWEIS----', 'LICHA ', 'LIKA ',
638
        'LIC$', 'LIZ', 'LIZ',
639
        'LIVE^$', 'LEIF', 'LEIF',
640
        'LT(SßZ)$', 'LS', 'LZ',
641
        "LT'S$", 'LS', 'LZ',
642
        'LT´S$', 'LS', 'LZ',
643
        'LUI(GS)--', 'LU', 'LU',
644
        'LV(AIO)-', 'LW', None,
645
        'LY9^', 'LÜ', None,
646
        'LSTS$', 'LS', 'LZ',
647
        'LZ(BDFGKLMNPQRSTVWX)-', 'LS', None,
648
        'L(SßZ)$', 'LS', None,
649
        'MAIR-<', 'MEI', 'NEI',
650
        'MANAG-', 'MENE', 'NENE',
651
        'MANUEL', 'MANUEL', None,
652
        'MASSEU(RS)-', 'MASÖ', 'NAZÖ',
653
        'MATCH', 'MESH', 'NEZ',
654
        'MAURICE', 'MORIS', 'NURIZ',
655
        'MBH^$', 'MBH', 'MBH',
656
        'MB(ßZ)$', 'MS', None,
657
        'MB(SßTZ)-', 'M', 'N',
658
        'MCG9^', 'MAK', 'NAK',
659
        'MC9^', 'MAK', 'NAK',
660
        'MEMOIR-^', 'MEMOA', 'NENUA',
661
        'MERHAVEN$', 'MAHAFN', None,
662
        'ME(LMNRST)-3^', 'ME', 'NE',
663
        'MEN(STZ)--3', 'ME', None,
664
        'MEN$', 'MEN', None,
665
        'MIGUEL-', 'MIGE', 'NIKE',
666
        'MIKE^$', 'MEIK', 'NEIK',
667
        'MITHILFE----^$', 'MIT H', 'NIT ',
668
        'MN$', 'M', None,
669
        'MN', 'N', 'N',
670
        'MPJUTE-', 'MPUT', 'NBUT',
671
        'MP(ßZ)$', 'MS', None,
672
        'MP(SßTZ)-', 'M', 'N',
673
        'MP(BDJLMNPQVW)-', 'MB', 'NB',
674
        'MY9^', 'MÜ', None,
675
        'M(ßZ)$', 'MS', None,
676
        'M´G7^', 'MAK', 'NAK',
677
        "M'G7^", 'MAK', 'NAK',
678
        'M´^', 'MAK', 'NAK',
679
        "M'^", 'MAK', 'NAK',
680
        'M', None, 'N',
681
        'NACH^^', 'NACH', 'NAK',
682
        'NADINE', 'NADIN', 'NATIN',
683
        'NAIV--', 'NA', 'NA',
684
        'NAISE$', 'NESE', 'NEZE',
685
        'NAUGENOMM------', 'NAU ', 'NAU ',
686
        'NAUSOGUT$', 'NAUSO GUT', 'NAUZU KUT',
687
        'NCH$', 'NSH', 'NZ',
688
        'NCOISE$', 'SOA', 'ZUA',
689
        'NCOIS$', 'SOA', 'ZUA',
690
        'NDAR$', 'NDA', 'NTA',
691
        'NDERINGEN------', 'NDE ', 'NTE ',
692
        'NDRO(CDKTZ)-', 'NTRO', None,
693
        'ND(BFGJLMNPQVW)-', 'NT', None,
694
        'ND(SßZ)$', 'NS', 'NZ',
695
        "ND'S$", 'NS', 'NZ',
696
        'ND´S$', 'NS', 'NZ',
697
        'NEBEN^^', 'NEBN', 'NEBN',
698
        'NENGELERN------', 'NEN ', 'NEN ',
699
        'NENLERN(ET)---', 'NEN LE', 'NEN LE',
700
        'NENZULERNE---', 'NEN ZU LE', 'NEN ZU LE',
701
        'NE(LMNRST)-3^', 'NE', 'NE',
702
        'NEN-3', 'NE', 'NE',
703
        'NETTE$', 'NET', 'NET',
704
        'NGU^^', 'NU', 'NU',
705
        'NG(BDFJLMNPQRTVW)-', 'NK', 'NK',
706
        'NH(AUO)-$', 'NI', 'NI',
707
        'NICHTSAHNEN-----', 'NIX ', 'NIX ',
708
        'NICHTSSAGE----', 'NIX ', 'NIX ',
709
        'NICHTS^^', 'NIX', 'NIX',
710
        'NICHT^^', 'NICHT', 'NIKT',
711
        'NINE$', 'NIN', 'NIN',
712
        'NON^^', 'NON', 'NUN',
713
        'NOTLEIDE-----^', 'NOT ', 'NUT ',
714
        'NOT^^', 'NOT', 'NUT',
715
        'NTI(AIOU)-3', 'NZI', 'NZI',
716
        'NTIEL--3', 'NZI', 'NZI',
717
        'NT(SßZ)$', 'NS', 'NZ',
718
        "NT'S$", 'NS', 'NZ',
719
        'NT´S$', 'NS', 'NZ',
720
        'NYLON', 'NEILON', 'NEILUN',
721
        'NY9^', 'NÜ', None,
722
        'NSTZUNEH---', 'NST ZU ', 'NZT ZU ',
723
        'NSZ-', 'NS', None,
724
        'NSTS$', 'NS', 'NZ',
725
        'NZ(BDFGKLMNPQRSTVWX)-', 'NS', None,
726
        'N(SßZ)$', 'NS', None,
727
        'OBERE-', 'OBER', None,
728
        'OBER^^', 'OBA', 'UBA',
729
        'OEU2', 'Ö', 'Ö',
730
        'OE<2', 'Ö', 'Ö',
731
        'OGL-', 'OK', None,
732
        'OGNIE-', 'ONI', 'UNI',
733
        'OGN(AEOU)-$', 'ONI', 'UNI',
734
        'OH(AIOÖUÜY)-', 'OH', None,
735
        'OIE$', 'Ö', 'Ö',
736
        'OIRE$', 'OA', 'UA',
737
        'OIR$', 'OA', 'UA',
738
        'OIX', 'OA', 'UA',
739
        'OI<3', 'EU', 'EU',
740
        'OKAY^$', 'OKE', 'UKE',
741
        'OLYN$', 'OLIN', 'ULIN',
742
        'OO(DLMZ)-', 'U', None,
743
        'OO$', 'U', None,
744
        'OO-', '', '',
745
        'ORGINAL-----', 'ORI', 'URI',
746
        'OTI(AÄOÖUÜ)-', 'OZI', 'UZI',
747
        'OUI^', 'WI', 'FI',
748
        'OUILLE$', 'ULIE', 'ULIE',
749
        'OU(DT)-^', 'AU', 'AU',
750
        'OUSE$', 'AUS', 'AUZ',
751
        'OUT-', 'AU', 'AU',
752
        'OU', 'U', 'U',
753
        'O(FV)$', 'AU', 'AU',  # due to 'OW$' -> 'AU'
754
        'OVER--<', 'OW', None,
755
        'OV(AOU)-', 'OW', None,
756
        'OW$', 'AU', 'AU',
757
        'OWS$', 'OS', 'UZ',
758
        'OJ(AÄEIOÖUÜ)--', 'O', 'U',
759
        'OYER', 'OIA', None,
760
        'OY(AÄEIOÖUÜ)--', 'O', 'U',
761
        'O(JY)<', 'EU', 'EU',
762
        'OZ$', 'OS', None,
763
        'O´^', 'O', 'U',
764
        "O'^", 'O', 'U',
765
        'O', None, 'U',
766
        'PATIEN--^', 'PAZI', 'PAZI',
767
        'PENSIO-^', 'PANSI', 'PANZI',
768
        'PE(LMNRST)-3^', 'PE', 'PE',
769
        'PFER-^', 'FE', 'FE',
770
        'P(FH)<', 'F', 'F',
771
        'PIC^$', 'PIK', 'PIK',
772
        'PIC$', 'PIZ', 'PIZ',
773
        'PIPELINE', 'PEIBLEIN', 'PEIBLEIN',
774
        'POLYP-', 'POLÜ', None,
775
        'POLY^^', 'POLI', 'PULI',
776
        'PORTRAIT7', 'PORTRE', 'PURTRE',
777
        'POWER7', 'PAUA', 'PAUA',
778
        'PP(FH)--<', 'B', 'B',
779
        'PP-', '', '',
780
        'PRODUZ-^', 'PRODU', 'BRUTU',
781
        'PRODUZI--', ' PRODU', ' BRUTU',
782
        'PRIX^$', 'PRI', 'PRI',
783
        'PS-^^', 'P', None,
784
        'P(SßZ)^', None, 'Z',
785
        'P(SßZ)$', 'BS', None,
786
        'PT-^', '', '',
787
        'PTI(AÄOÖUÜ)-3', 'BZI', 'BZI',
788
        'PY9^', 'PÜ', None,
789
        'P(AÄEIOÖRUÜY)-', 'P', 'P',
790
        'P(ÀÁÂÃÅÈÉÊÌÍÎÙÚÛ)-', 'P', None,
791
        'P.^', None, 'P.',
792
        'P^', 'P', None,
793
        'P', 'B', 'B',
794
        'QI-', 'Z', 'Z',
795
        'QUARANT--', 'KARA', 'KARA',
796
        'QUE(LMNRST)-3', 'KWE', 'KFE',
797
        'QUE$', 'K', 'K',
798
        'QUI(NS)$', 'KI', 'KI',
799
        'QUIZ7', 'KWIS', None,
800
        'Q(UV)7', 'KW', 'KF',
801
        'Q<', 'K', 'K',
802
        'RADFAHR----', 'RAT ', 'RAT ',
803
        'RAEFTEZEHRE-----', 'REFTE ', 'REFTE ',
804
        'RCH', 'RCH', 'RK',
805
        'REA(DU)---3^', 'R', None,
806
        'REBSERZEUG------', 'REBS ', 'REBZ ',
807
        'RECHERCH^', 'RESHASH', 'REZAZ',
808
        'RECYCL--', 'RIZEI', 'RIZEI',
809
        'RE(ALST)-3^', 'RE', None,
810
        'REE$', 'RI', 'RI',
811
        'RER$', 'RA', 'RA',
812
        'RE(MNR)-4', 'RE', 'RE',
813
        'RETTE$', 'RET', 'RET',
814
        'REUZ$', 'REUZ', None,
815
        'REW$', 'RU', 'RU',
816
        'RH<^', 'R', 'R',
817
        'RJA(MN)--', 'RI', 'RI',
818
        'ROWD-^', 'RAU', 'RAU',
819
        'RTEMONNAIE-', 'RTMON', 'RTNUN',
820
        'RTI(AÄOÖUÜ)-3', 'RZI', 'RZI',
821
        'RTIEL--3', 'RZI', 'RZI',
822
        'RV(AEOU)-3', 'RW', None,
823
        'RY(KN)-$', 'RI', 'RI',
824
        'RY9^', 'RÜ', None,
825
        'RÄFTEZEHRE-----', 'REFTE ', 'REFTE ',
826
        'SAISO-^', 'SES', 'ZEZ',
827
        'SAFE^$', 'SEIF', 'ZEIF',
828
        'SAUCE-^', 'SOS', 'ZUZ',
829
        'SCHLAGGEBEN-----<', 'SHLAK ', 'ZLAK ',
830
        'SCHSCH---7', '', '',
831
        'SCHTSCH', 'SH', 'Z',
832
        'SC(HZ)<', 'SH', 'Z',
833
        'SC', 'SK', 'ZK',
834
        'SELBSTST--7^^', 'SELB', 'ZELB',
835
        'SELBST7^^', 'SELBST', 'ZELBZT',
836
        'SERVICE7^', 'SÖRWIS', 'ZÖRFIZ',
837
        'SERVI-^', 'SERW', None,
838
        'SE(LMNRST)-3^', 'SE', 'ZE',
839
        'SETTE$', 'SET', 'ZET',
840
        'SHP-^', 'S', 'Z',
841
        'SHST', 'SHT', 'ZT',
842
        'SHTSH', 'SH', 'Z',
843
        'SHT', 'ST', 'Z',
844
        'SHY9^', 'SHÜ', None,
845
        'SH^^', 'SH', None,
846
        'SH3', 'SH', 'Z',
847
        'SICHERGEGAN-----^', 'SICHA ', 'ZIKA ',
848
        'SICHERGEHE----^', 'SICHA ', 'ZIKA ',
849
        'SICHERGESTEL------^', 'SICHA ', 'ZIKA ',
850
        'SICHERSTELL-----^', 'SICHA ', 'ZIKA ',
851
        'SICHERZU(GS)--^', 'SICHA ZU ', 'ZIKA ZU ',
852
        'SIEGLI-^', 'SIKL', 'ZIKL',
853
        'SIGLI-^', 'SIKL', 'ZIKL',
854
        'SIGHT', 'SEIT', 'ZEIT',
855
        'SIGN', 'SEIN', 'ZEIN',
856
        'SKI(NPZ)-', 'SKI', 'ZKI',
857
        'SKI<^', 'SHI', 'ZI',
858
        'SODASS^$', 'SO DAS', 'ZU TAZ',
859
        'SODAß^$', 'SO DAS', 'ZU TAZ',
860
        'SOGENAN--^', 'SO GEN', 'ZU KEN',
861
        'SOUND-', 'SAUN', 'ZAUN',
862
        'STAATS^^', 'STAZ', 'ZTAZ',
863
        'STADT^^', 'STAT', 'ZTAT',
864
        'STANDE$', ' STANDE', ' ZTANTE',
865
        'START^^', 'START', 'ZTART',
866
        'STAURANT7', 'STORAN', 'ZTURAN',
867
        'STEAK-', 'STE', 'ZTE',
868
        'STEPHEN-^$', 'STEW', None,
869
        'STERN', 'STERN', None,
870
        'STRAF^^', 'STRAF', 'ZTRAF',
871
        "ST'S$", 'Z', 'Z',
872
        'ST´S$', 'Z', 'Z',
873
        'STST--', '', '',
874
        'STS(ACEÈÉÊHIÌÍÎOUÄÜÖ)--', 'ST', 'ZT',
875
        'ST(SZ)', 'Z', 'Z',
876
        'SPAREN---^', 'SPA', 'ZPA',
877
        'SPAREND----', ' SPA', ' ZPA',
878
        'S(PTW)-^^', 'S', None,
879
        'SP', 'SP', None,
880
        'STYN(AE)-$', 'STIN', 'ZTIN',
881
        'ST', 'ST', 'ZT',
882
        'SUITE<', 'SIUT', 'ZIUT',
883
        'SUKE--$', 'S', 'Z',
884
        'SURF(EI)-', 'SÖRF', 'ZÖRF',
885
        'SV(AEÈÉÊIÌÍÎOU)-<^', 'SW', None,
886
        'SYB(IY)--^', 'SIB', None,
887
        'SYL(KVW)--^', 'SI', None,
888
        'SY9^', 'SÜ', None,
889
        'SZE(NPT)-^', 'ZE', 'ZE',
890
        'SZI(ELN)-^', 'ZI', 'ZI',
891
        'SZCZ<', 'SH', 'Z',
892
        'SZT<', 'ST', 'ZT',
893
        'SZ<3', 'SH', 'Z',
894
        'SÜL(KVW)--^', 'SI', None,
895
        'S', None, 'Z',
896
        'TCH', 'SH', 'Z',
897
        'TD(AÄEIOÖRUÜY)-', 'T', None,
898
        'TD(ÀÁÂÃÅÈÉÊËÌÍÎÏÒÓÔÕØÙÚÛÝŸ)-', 'T', None,
899
        'TEAT-^', 'TEA', 'TEA',
900
        'TERRAI7^', 'TERA', 'TERA',
901
        'TE(LMNRST)-3^', 'TE', 'TE',
902
        'TH<', 'T', 'T',
903
        'TICHT-', 'TIK', 'TIK',
904
        'TICH$', 'TIK', 'TIK',
905
        'TIC$', 'TIZ', 'TIZ',
906
        'TIGGESTELL-------', 'TIK ', 'TIK ',
907
        'TIGSTELL-----', 'TIK ', 'TIK ',
908
        'TOAS-^', 'TO', 'TU',
909
        'TOILET-', 'TOLE', 'TULE',
910
        'TOIN-', 'TOA', 'TUA',
911
        'TRAECHTI-^', 'TRECHT', 'TREKT',
912
        'TRAECHTIG--', ' TRECHT', ' TREKT',
913
        'TRAINI-', 'TREN', 'TREN',
914
        'TRÄCHTI-^', 'TRECHT', 'TREKT',
915
        'TRÄCHTIG--', ' TRECHT', ' TREKT',
916
        'TSCH', 'SH', 'Z',
917
        'TSH', 'SH', 'Z',
918
        'TST', 'ZT', 'ZT',
919
        'T(Sß)', 'Z', 'Z',
920
        'TT(SZ)--<', '', '',
921
        'TT9', 'T', 'T',
922
        'TV^$', 'TV', 'TV',
923
        'TX(AEIOU)-3', 'SH', 'Z',
924
        'TY9^', 'TÜ', None,
925
        'TZ-', '', '',
926
        "T'S3$", 'Z', 'Z',
927
        'T´S3$', 'Z', 'Z',
928
        'UEBEL(GNRW)-^^', 'ÜBL ', 'IBL ',
929
        'UEBER^^', 'ÜBA', 'IBA',
930
        'UE2', 'Ü', 'I',
931
        'UGL-', 'UK', None,
932
        'UH(AOÖUÜY)-', 'UH', None,
933
        'UIE$', 'Ü', 'I',
934
        'UM^^', 'UM', 'UN',
935
        'UNTERE--3', 'UNTE', 'UNTE',
936
        'UNTER^^', 'UNTA', 'UNTA',
937
        'UNVER^^', 'UNFA', 'UNFA',
938
        'UN^^', 'UN', 'UN',
939
        'UTI(AÄOÖUÜ)-', 'UZI', 'UZI',
940
        'UVE-4', 'UW', None,
941
        'UY2', 'UI', None,
942
        'UZZ', 'AS', 'AZ',
943
        'VACL-^', 'WAZ', 'FAZ',
944
        'VAC$', 'WAZ', 'FAZ',
945
        'VAN DEN ^', 'FANDN', 'FANTN',
946
        'VANES-^', 'WANE', None,
947
        'VATRO-', 'WATR', None,
948
        'VA(DHJNT)--^', 'F', None,
949
        'VEDD-^', 'FE', 'FE',
950
        'VE(BEHIU)--^', 'F', None,
951
        'VEL(BDLMNT)-^', 'FEL', None,
952
        'VENTZ-^', 'FEN', None,
953
        'VEN(NRSZ)-^', 'FEN', None,
954
        'VER(AB)-^$', 'WER', None,
955
        'VERBAL^$', 'WERBAL', None,
956
        'VERBAL(EINS)-^', 'WERBAL', None,
957
        'VERTEBR--', 'WERTE', None,
958
        'VEREIN-----', 'F', None,
959
        'VEREN(AEIOU)-^', 'WEREN', None,
960
        'VERIFI', 'WERIFI', None,
961
        'VERON(AEIOU)-^', 'WERON', None,
962
        'VERSEN^', 'FERSN', 'FAZN',
963
        'VERSIERT--^', 'WERSI', None,
964
        'VERSIO--^', 'WERS', None,
965
        'VERSUS', 'WERSUS', None,
966
        'VERTI(GK)-', 'WERTI', None,
967
        'VER^^', 'FER', 'FA',
968
        'VERSPRECHE-------', ' FER', ' FA',
969
        'VER$', 'WA', None,
970
        'VER', 'FA', 'FA',
971
        'VET(HT)-^', 'FET', 'FET',
972
        'VETTE$', 'WET', 'FET',
973
        'VE^', 'WE', None,
974
        'VIC$', 'WIZ', 'FIZ',
975
        'VIELSAGE----', 'FIL ', 'FIL ',
976
        'VIEL', 'FIL', 'FIL',
977
        'VIEW', 'WIU', 'FIU',
978
        'VILL(AE)-', 'WIL', None,
979
        'VIS(ACEIKUVWZ)-<^', 'WIS', None,
980
        'VI(ELS)--^', 'F', None,
981
        'VILLON--', 'WILI', 'FILI',
982
        'VIZE^^', 'FIZE', 'FIZE',
983
        'VLIE--^', 'FL', None,
984
        'VL(AEIOU)--', 'W', None,
985
        'VOKA-^', 'WOK', None,
986
        'VOL(ATUVW)--^', 'WO', None,
987
        'VOR^^', 'FOR', 'FUR',
988
        'VR(AEIOU)--', 'W', None,
989
        'VV9', 'W', None,
990
        'VY9^', 'WÜ', 'FI',
991
        'V(ÜY)-', 'W', None,
992
        'V(ÀÁÂÃÅÈÉÊÌÍÎÙÚÛ)-', 'W', None,
993
        'V(AEIJLRU)-<', 'W', None,
994
        'V.^', 'V.', None,
995
        'V<', 'F', 'F',
996
        'WEITERENTWI-----^', 'WEITA ', 'FEITA ',
997
        'WEITREICH-----^', 'WEIT ', 'FEIT ',
998
        'WEITVER^', 'WEIT FER', 'FEIT FA',
999
        'WE(LMNRST)-3^', 'WE', 'FE',
1000
        'WER(DST)-', 'WER', None,
1001
        'WIC$', 'WIZ', 'FIZ',
1002
        'WIEDERU--', 'WIDE', 'FITE',
1003
        'WIEDER^$', 'WIDA', 'FITA',
1004
        'WIEDER^^', 'WIDA ', 'FITA ',
1005
        'WIEVIEL', 'WI FIL', 'FI FIL',
1006
        'WISUEL', 'WISUEL', None,
1007
        'WR-^', 'W', None,
1008
        'WY9^', 'WÜ', 'FI',
1009
        'W(BDFGJKLMNPQRSTZ)-', 'F', None,
1010
        'W$', 'F', None,
1011
        'W', None, 'F',
1012
        'X<^', 'Z', 'Z',
1013
        'XHAVEN$', 'XAFN', None,
1014
        'X(CSZ)', 'X', 'X',
1015
        'XTS(CH)--', 'XT', 'XT',
1016
        'XT(SZ)', 'Z', 'Z',
1017
        'YE(LMNRST)-3^', 'IE', 'IE',
1018
        'YE-3', 'I', 'I',
1019
        'YOR(GK)^$', 'IÖRK', 'IÖRK',
1020
        'Y(AOU)-<7', 'I', 'I',
1021
        'Y(BKLMNPRSTX)-1', 'Ü', None,
1022
        'YVES^$', 'IF', 'IF',
1023
        'YVONNE^$', 'IWON', 'IFUN',
1024
        'Y.^', 'Y.', None,
1025
        'Y', 'I', 'I',
1026
        'ZC(AOU)-', 'SK', 'ZK',
1027
        'ZE(LMNRST)-3^', 'ZE', 'ZE',
1028
        'ZIEJ$', 'ZI', 'ZI',
1029
        'ZIGERJA(HR)-3', 'ZIGA IA', 'ZIKA IA',
1030
        'ZL(AEIOU)-', 'SL', None,
1031
        'ZS(CHT)--', '', '',
1032
        'ZS', 'SH', 'Z',
1033
        'ZUERST', 'ZUERST', 'ZUERST',
1034
        'ZUGRUNDE^$', 'ZU GRUNDE', 'ZU KRUNTE',
1035
        'ZUGRUNDE', 'ZU GRUNDE ', 'ZU KRUNTE ',
1036
        'ZUGUNSTEN', 'ZU GUNSTN', 'ZU KUNZTN',
1037
        'ZUHAUSE-', 'ZU HAUS', 'ZU AUZ',
1038
        'ZULASTEN^$', 'ZU LASTN', 'ZU LAZTN',
1039
        'ZURUECK^^', 'ZURÜK', 'ZURIK',
1040
        'ZURZEIT', 'ZUR ZEIT', 'ZUR ZEIT',
1041
        'ZURÜCK^^', 'ZURÜK', 'ZURIK',
1042
        'ZUSTANDE', 'ZU STANDE', 'ZU ZTANTE',
1043
        'ZUTAGE', 'ZU TAGE', 'ZU TAKE',
1044
        'ZUVER^^', 'ZUFA', 'ZUFA',
1045
        'ZUVIEL', 'ZU FIL', 'ZU FIL',
1046
        'ZUWENIG', 'ZU WENIK', 'ZU FENIK',
1047
        'ZY9^', 'ZÜ', None,
1048
        'ZYK3$', 'ZIK', None,
1049
        'Z(VW)7^', 'SW', None,
1050
        None, None, None
1051
        # fmt: on
1052
    )  # type: Tuple[Optional[str], ...]
1053
1054
    # Translation table handed to ``str.translate`` by ``encode`` to
    # upper-case its input.  Keys are the code points of the lower-case
    # letters (ASCII a-z plus the accented Latin letters listed below);
    # values are the matching upper-case characters.  'ß' maps to itself.
    _upper_trans = {
        ord(lower): upper
        for lower, upper in zip(
            'abcdefghijklmnopqrstuvwxyzàáâãåäæ'
            + 'çðèéêëìíîïñòóôõöøœšßþùúûüýÿ',
            'ABCDEFGHIJKLMNOPQRSTUVWXYZÀÁÂÃÅÄÆ'
            + 'ÇÐÈÉÊËÌÍÎÏÑÒÓÔÕÖØŒŠßÞÙÚÛÜÝŸ',
        )
    }
1065
1066
    def __init__(self, mode: int = 1, lang: str = 'de') -> None:
        """Initialize Phonet instance.

        Parameters
        ----------
        mode : int
            The phonet variant to employ (1 or 2). ``encode`` uses this
            value as the offset into each rule triple, so it selects the
            first or the second replacement string of a rule.
        lang : str
            ``de`` (default) for German, ``none`` for no language. Any value
            other than ``none`` makes ``encode`` use the German rule set.


        .. versionadded:: 0.4.0

        """
        # Both values are only stored here; they are consumed by ``encode``.
        self._mode = mode
        self._lang = lang
1082
1083
    def encode(self, word: str) -> str:
1084
        """Return the phonet code for a word.
1085
1086
        Parameters
1087
        ----------
1088
        word : str
1089
            The word to transform
1090
1091
        Returns
1092
        -------
1093
        str
1094 1
            The phonet value
1095 1
1096
        Examples
1097 1
        --------
1098
        >>> pe = Phonet()
1099
        >>> pe.encode('Christopher')
1100
        'KRISTOFA'
1101
        >>> pe.encode('Niall')
1102
        'NIAL'
1103
        >>> pe.encode('Smith')
1104
        'SMIT'
1105
        >>> pe.encode('Schmidt')
1106
        'SHMIT'
1107
1108
        >>> pe2 = Phonet(mode=2)
1109
        >>> pe2.encode('Christopher')
1110
        'KRIZTUFA'
1111
        >>> pe2.encode('Niall')
1112
        'NIAL'
1113
        >>> pe2.encode('Smith')
1114
        'ZNIT'
1115
        >>> pe2.encode('Schmidt')
1116
        'ZNIT'
1117
1118
        >>> pe_none = Phonet(lang='none')
1119
        >>> pe_none.encode('Christopher')
1120
        'CHRISTOPHER'
1121
        >>> pe_none.encode('Niall')
1122
        'NIAL'
1123
        >>> pe_none.encode('Smith')
1124
        'SMITH'
1125
        >>> pe_none.encode('Schmidt')
1126
        'SCHMIDT'
1127
1128
1129
        .. versionadded:: 0.1.0
1130
        .. versionchanged:: 0.3.6
1131
            Encapsulated in class
1132
1133
        """
1134
        phonet_hash = Counter()  # type: TCounter[str]
1135
        alpha_pos = Counter()  # type: TCounter[str]
1136
1137
        phonet_hash_1 = Counter()  # type: TCounter[Tuple[int, int]]
1138
        phonet_hash_2 = Counter()  # type: TCounter[Tuple[int, int]]
1139
1140
        def _initialize_phonet(lang: str) -> None:
1141
            """Initialize phonet variables.
1142
1143
            Parameters
1144
            ----------
1145
            lang : str
1146
                Language to use for rules
1147
1148 1
            .. versionadded:: 0.1.0
1149 1
1150
            """
1151 1
            if lang == 'none':
1152 1
                _phonet_rules = self._rules_no_lang
1153
            else:
1154 1
                _phonet_rules = self._rules_german
1155
1156
            phonet_hash[''] = -1
1157
1158
            # German and international umlauts
1159
            for ch in {
1160
                'À',
1161
                'Á',
1162
                'Â',
1163
                'Ã',
1164
                'Ä',
1165 1
                'Å',
1166 1
                'Æ',
1167
                'Ç',
1168 1
                'È',
1169
                'É',
1170 1
                'Ê',
1171
                'Ë',
1172
                'Ì',
1173 1
                'Í',
1174
                'Î',
1175
                'Ï',
1176
                'Ð',
1177
                'Ñ',
1178
                'Ò',
1179
                'Ó',
1180
                'Ô',
1181
                'Õ',
1182
                'Ö',
1183
                'Ø',
1184
                'Ù',
1185
                'Ú',
1186
                'Û',
1187
                'Ü',
1188
                'Ý',
1189
                'Þ',
1190
                'ß',
1191
                'Œ',
1192
                'Š',
1193
                'Ÿ',
1194
            }:
1195
                alpha_pos[ch] = 1
1196
                phonet_hash[ch] = -1
1197
1198
            # "normal" letters ('A'-'Z')
1199
            for i, ch in enumerate('ABCDEFGHIJKLMNOPQRSTUVWXYZ'):
1200
                alpha_pos[ch] = i + 2
1201
                phonet_hash[ch] = -1
1202
1203
            for i in range(26):
1204
                for j in range(28):
1205
                    phonet_hash_1[i, j] = -1
1206
                    phonet_hash_2[i, j] = -1
1207
1208
            # for each phonetc rule
1209 1
            for i in range(len(_phonet_rules)):
1210 1
                rule = _phonet_rules[i]
1211
1212
                if rule and i % 3 == 0:
1213 1
                    # calculate first hash value
1214 1
                    ch = cast(str, _phonet_rules[i])[0]
1215 1
1216
                    if phonet_hash[ch] < 0 and (
0 ignored issues
show
Comprehensibility Best Practice introduced by
The variable phonet_hash does not seem to be defined.
Loading history...
1217 1
                        cast(str, _phonet_rules[i + 1])
1218 1
                        or cast(str, _phonet_rules[i + 2])
1219 1
                    ):
1220 1
                        phonet_hash[ch] = i
1221
1222
                    # calculate second hash values
1223 1
                    if ch and alpha_pos[ch] >= 2:
0 ignored issues
show
Comprehensibility Best Practice introduced by
The variable alpha_pos does not seem to be defined.
Loading history...
1224 1
                        k = alpha_pos[ch]
1225
1226 1
                        j = k - 2
1227
                        rule = rule[1:]
1228 1
1229
                        if not rule:
1230 1
                            rule = ' '
1231
                        elif rule[0] == '(':
1232
                            rule = rule[1:]
1233 1
                        else:
1234
                            rule = rule[0]
1235
1236 1
                        while rule and (rule[0] != ')'):
1237 1
                            k = alpha_pos[rule[0]]
1238
1239 1
                            if k > 0:
1240 1
                                # add hash value for this letter
1241
                                if phonet_hash_1[j, k] < 0:
0 ignored issues
show
Comprehensibility Best Practice introduced by
The variable phonet_hash_1 does not seem to be defined.
Loading history...
1242 1
                                    phonet_hash_1[j, k] = i
1243 1
                                    phonet_hash_2[j, k] = i
1244 1
1245 1
                                if phonet_hash_2[j, k] >= (i - 30):
0 ignored issues
show
Comprehensibility Best Practice introduced by
The variable phonet_hash_2 does not seem to be defined.
Loading history...
1246
                                    phonet_hash_2[j, k] = i
1247 1
                                else:
1248
                                    k = -1
1249 1
1250 1
                            if k <= 0:
1251
                                # add hash value for all letters
1252 1
                                if phonet_hash_1[j, 0] < 0:
1253
                                    phonet_hash_1[j, 0] = i
1254 1
1255 1
                                phonet_hash_2[j, 0] = i
1256 1
1257
                            rule = rule[1:]
1258 1
1259 1
        def _phonet(term: str, mode: int, lang: str) -> str:
1260
            """Return the phonet coded form of a term.
1261 1
1262
            Parameters
1263 1
            ----------
1264
            term : str
1265 1
                Term to transform
1266 1
            mode : int
1267
                The ponet variant to employ (1 or 2)
1268 1
            lang : str
1269
                ``de`` (default) for German, ``none`` for no language
1270 1
1271
            Returns
1272 1
            -------
1273
            str
1274
                The phonet value
1275
1276
            .. versionadded:: 0.1.0
1277
1278
            """
1279
            if lang == 'none':
1280
                _phonet_rules = self._rules_no_lang
1281
            else:
1282
                _phonet_rules = self._rules_german
1283
1284
            char0 = ''
1285
            dest = term
1286
1287
            if not term:
1288
                return ''
1289
1290
            term_length = len(term)
1291
1292 1
            # convert input string to upper-case
1293 1
            src = term.translate(self._upper_trans)
1294
1295 1
            # check "src"
1296
            i = 0
1297 1
            j = 0
1298 1
            zeta = 0
1299
1300 1
            while i < len(src):
1301 1
                char = src[i]
1302
1303 1
                pos = alpha_pos[char]
1304
1305
                if pos >= 2:
1306 1
                    xpos = pos - 2
1307
1308
                    if i + 1 == len(src):
1309 1
                        pos = alpha_pos['']
1310 1
                    else:
1311 1
                        pos = alpha_pos[src[i + 1]]
1312
1313 1
                    start1 = phonet_hash_1[xpos, pos]
1314 1
                    start2 = phonet_hash_1[xpos, 0]
1315
                    end1 = phonet_hash_2[xpos, pos]
1316 1
                    end2 = phonet_hash_2[xpos, 0]
1317
1318 1
                    # preserve rule priorities
1319 1
                    if (start2 >= 0) and ((start1 < 0) or (start2 < start1)):
1320
                        pos = start1
1321 1
                        start1 = start2
1322 1
                        start2 = pos
1323
                        pos = end1
1324 1
                        end1 = end2
1325
                        end2 = pos
1326 1
1327 1
                    if (end1 >= start2) and (start2 >= 0):
1328 1
                        if end2 > end1:
1329 1
                            end1 = end2
1330
1331
                        start2 = -1
1332 1
                        end2 = -1
1333 1
                else:
1334 1
                    pos = phonet_hash[char]
1335 1
                    start1 = pos
1336 1
                    end1 = 10000
1337 1
                    start2 = -1
1338 1
                    end2 = -1
1339
1340 1
                pos = start1
1341 1
                zeta0 = 0
1342 1
1343
                if pos >= 0:
1344 1
                    # check rules for this char
1345 1
                    while (_phonet_rules[pos] is None) or (
1346
                        cast(str, _phonet_rules[pos])[0] == char
1347 1
                    ):
1348 1
                        if pos > end1:
1349 1
                            if start2 > 0:
1350 1
                                pos = start2
1351 1
                                start1 = start2
1352
                                start2 = -1
1353 1
                                end1 = end2
1354 1
                                end2 = -1
1355
                                continue
1356 1
1357
                            break
1358 1
1359
                        if (_phonet_rules[pos] is None) or (
1360
                            _phonet_rules[pos + mode] is None
1361 1
                        ):
1362 1
                            # no conversion rule available
1363 1
                            pos += 3
1364 1
                            continue
1365 1
1366 1
                        # check whole string
1367 1
                        matches = 1  # number of matching letters
1368 1
                        priority = 5  # default priority
1369
                        rule = cast(str, _phonet_rules[pos])[1:]
1370 1
1371
                        while (
1372 1
                            rule
1373
                            and (len(src) > (i + matches))
1374
                            and (src[i + matches] == rule[0])
1375
                            and not rule[0].isdigit()
1376 1
                            and (rule not in '(-<^$')
1377 1
                        ):
1378
                            matches += 1
1379
                            rule = rule[1:]
1380 1
1381 1
                        if rule and (rule[0] == '('):
1382 1
                            # check an array of letters
1383 1
                            if (
1384
                                (len(src) > (i + matches))
1385 1
                                and src[i + matches].isalpha()
1386
                                and (src[i + matches] in rule[1:])
1387
                            ):
1388
                                matches += 1
1389
1390
                                while rule and rule[0] != ')':
1391
                                    rule = rule[1:]
1392 1
1393 1
                                # if rule[0] == ')':
1394
                                rule = rule[1:]
1395 1
1396
                        if rule:
1397 1
                            priority0 = ord(rule[0])
1398
                        else:
1399
                            priority0 = 0
1400
1401
                        matches0 = matches
1402 1
1403
                        while rule and rule[0] == '-' and matches > 1:
1404 1
                            matches -= 1
1405 1
                            rule = rule[1:]
1406
1407
                        if rule and rule[0] == '<':
1408 1
                            rule = rule[1:]
1409
1410 1
                        if rule and rule[0].isdigit():
1411 1
                            # read priority
1412
                            priority = int(rule[0])
1413 1
                            rule = rule[1:]
1414
1415 1
                        if rule and rule[0:2] == '^^':
1416
                            rule = rule[1:]
1417 1
1418 1
                        if (
1419 1
                            not rule
1420
                            or (
1421 1
                                (rule[0] == '^')
1422 1
                                and ((i == 0) or not src[i - 1].isalpha())
1423
                                and (
1424 1
                                    (rule[1:2] != '$')
1425
                                    or (
1426 1
                                        not (
1427 1
                                            src[
1428
                                                i + matches0 : i + matches0 + 1
1429 1
                                            ].isalpha()
1430 1
                                        )
1431
                                        and (
1432 1
                                            src[
1433
                                                i + matches0 : i + matches0 + 1
1434
                                            ]
1435
                                            != '.'
1436
                                        )
1437
                                    )
1438
                                )
1439
                            )
1440
                            or (
1441
                                (rule[0] == '$')
1442
                                and (i > 0)
1443
                                and src[i - 1].isalpha()
1444
                                and (
1445
                                    (
1446
                                        not src[
1447
                                            i + matches0 : i + matches0 + 1
1448
                                        ].isalpha()
1449
                                    )
1450
                                    and (
1451
                                        src[i + matches0 : i + matches0 + 1]
1452
                                        != '.'
1453
                                    )
1454
                                )
1455
                            )
1456
                        ):
1457
                            # look for continuation, if:
1458
                            # matches > 1 und NO '-' in first string */
1459
                            pos0 = -1
1460
1461
                            start3 = 0
1462
                            start4 = 0
1463
                            end3 = 0
1464
                            end4 = 0
1465
1466
                            if (
1467
                                (matches > 1)
1468
                                and src[i + matches : i + matches + 1]
1469
                                and (priority0 != ord('-'))
1470
                            ):
1471
                                char0 = src[i + matches - 1]
1472
                                pos0 = alpha_pos[char0]
1473 1
1474
                                if pos0 >= 2 and src[i + matches]:
1475 1
                                    xpos = pos0 - 2
1476 1
                                    pos0 = alpha_pos[src[i + matches]]
1477 1
                                    start3 = phonet_hash_1[xpos, pos0]
1478 1
                                    start4 = phonet_hash_1[xpos, 0]
1479
                                    end3 = phonet_hash_2[xpos, pos0]
1480 1
                                    end4 = phonet_hash_2[xpos, 0]
1481
1482
                                    # preserve rule priorities
1483
                                    if (start4 >= 0) and (
1484
                                        (start3 < 0) or (start4 < start3)
1485 1
                                    ):
1486 1
                                        pos0 = start3
1487
                                        start3 = start4
1488 1
                                        start4 = pos0
1489 1
                                        pos0 = end3
1490 1
                                        end3 = end4
1491 1
                                        end4 = pos0
1492 1
1493 1
                                    if (end3 >= start4) and (start4 >= 0):
1494 1
                                        if end4 > end3:
1495
                                            end3 = end4
1496
1497 1
                                        start4 = -1
1498
                                        end4 = -1
1499
                                else:
1500 1
                                    pos0 = phonet_hash[char0]
1501 1
                                    start3 = pos0
1502 1
                                    end3 = 10000
1503 1
                                    start4 = -1
1504 1
                                    end4 = -1
1505 1
1506
                                pos0 = start3
1507 1
1508 1
                            # check continuation rules for src[i+matches]
1509 1
                            if pos0 >= 0:
1510
                                while (_phonet_rules[pos0] is None) or (
1511 1
                                    cast(str, _phonet_rules[pos0])[0] == char0
1512 1
                                ):
1513
                                    if pos0 > end3:
1514 1
                                        if start4 > 0:
1515 1
                                            pos0 = start4
1516 1
                                            start3 = start4
1517 1
                                            start4 = -1
1518 1
                                            end3 = end4
1519
                                            end4 = -1
1520 1
                                            continue
1521
1522
                                        priority0 = -1
1523 1
1524 1
                                        # important
1525
                                        break
1526
1527 1
                                    if (_phonet_rules[pos0] is None) or (
1528 1
                                        _phonet_rules[pos0 + mode] is None
1529 1
                                    ):
1530 1
                                        # no conversion rule available
1531 1
                                        pos0 += 3
1532 1
                                        continue
1533 1
1534 1
                                    # check whole string
1535
                                    matches0 = matches
1536 1
                                    priority0 = 5
1537
                                    rule = cast(str, _phonet_rules[pos0])[1:]
1538
1539 1
                                    while (
1540
                                        rule
1541 1
                                        and (
1542
                                            src[
1543
                                                i + matches0 : i + matches0 + 1
1544
                                            ]
1545 1
                                            == rule[0]
1546 1
                                        )
1547
                                        and (
1548
                                            not rule[0].isdigit()
1549 1
                                            or (rule in '(-<^$')
1550 1
                                        )
1551 1
                                    ):
1552 1
                                        matches0 += 1
1553
                                        rule = rule[1:]
1554 1
1555
                                    if rule and rule[0] == '(':
1556
                                        # check an array of letters
1557
                                        if src[
1558
                                            i + matches0 : i + matches0 + 1
1559
                                        ].isalpha() and (
1560
                                            src[i + matches0] in rule[1:]
1561
                                        ):
1562
                                            matches0 += 1
1563
1564
                                            while rule and rule[0] != ')':
1565
                                                rule = rule[1:]
1566
1567 1
                                            # if rule[0] == ')':
1568 1
                                            rule = rule[1:]
1569
1570 1
                                    while rule and rule[0] == '-':
1571
                                        # "matches0" is NOT decremented
1572 1
                                        # because of
1573
                                        #    "if (matches0 == matches)"
1574
                                        rule = rule[1:]
1575
1576
                                    if rule and rule[0] == '<':
1577 1
                                        rule = rule[1:]
1578
1579 1
                                    if rule and rule[0].isdigit():
1580 1
                                        priority0 = int(rule[0])
1581
                                        rule = rule[1:]
1582
1583 1
                                    if (
1584
                                        not rule
1585 1
                                        # rule == '^' is not possible here
1586
                                        or (
1587
                                            (rule[0] == '$')
1588
                                            and not src[
1589 1
                                                i + matches0 : i + matches0 + 1
1590
                                            ].isalpha()
1591 1
                                            and (
1592 1
                                                src[
1593
                                                    i
1594 1
                                                    + matches0 : i
1595 1
                                                    + matches0
1596 1
                                                    + 1
1597
                                                ]
1598 1
                                                != '.'
1599
                                            )
1600
                                        )
1601
                                    ):
1602
                                        if matches0 == matches:
1603
                                            # this is only a partial string
1604
                                            pos0 += 3
1605
                                            continue
1606
1607
                                        if priority0 < priority:
1608
                                            # priority is too low
1609
                                            pos0 += 3
1610
                                            continue
1611
1612
                                        # continuation rule found
1613
                                        break
1614
1615
                                    pos0 += 3
1616
1617
                                # end of "while"
1618 1
                                if (priority0 >= priority) and (
1619
                                    (_phonet_rules[pos0] is not None)
1620 1
                                    and (
1621 1
                                        cast(str, _phonet_rules[pos0])[0]
1622
                                        == char0
1623 1
                                    )
1624
                                ):
1625 1
1626 1
                                    pos += 3
1627
                                    continue
1628
1629 1
                            # replace string
1630
                            if _phonet_rules[pos] and (
1631 1
                                '<' in cast(str, _phonet_rules[pos])[1:]
1632
                            ):
1633
                                priority0 = 1
1634 1
                            else:
1635
                                priority0 = 0
1636
1637
                            rule = cast(str, _phonet_rules[pos + mode])
1638
1639 1
                            if (priority0 == 1) and (zeta == 0):
1640 1
                                # rule with '<' is applied
1641
                                if (
1642
                                    (j > 0)
1643 1
                                    and rule
1644
                                    and (
1645
                                        (dest[j - 1] == char)
1646 1
                                        or (dest[j - 1] == rule[0])
1647
                                    )
1648 1
                                ):
1649
                                    j -= 1
1650 1
1651
                                zeta0 = 1
1652 1
                                zeta += 1
1653
                                matches0 = 0
1654 1
1655
                                while rule and src[i + matches0]:
1656
                                    src = (
1657
                                        src[0 : i + matches0]
1658
                                        + rule[0]
1659
                                        + src[i + matches0 + 1 :]
1660
                                    )
1661
                                    matches0 += 1
1662 1
                                    rule = rule[1:]
1663
1664 1
                                if matches0 < matches:
1665 1
                                    src = (
1666 1
                                        src[0 : i + matches0]
1667
                                        + src[i + matches :]
1668 1
                                    )
1669 1
1670
                                char = src[i]
1671
                            else:
1672
                                i = i + matches - 1
1673
                                zeta = 0
1674 1
1675 1
                                while len(rule) > 1:
1676
                                    if (j == 0) or (dest[j - 1] != rule[0]):
1677 1
                                        dest = (
1678 1
                                            dest[0:j]
1679
                                            + rule[0]
1680
                                            + dest[min(len(dest), j + 1) :]
1681
                                        )
1682
                                        j += 1
1683 1
1684
                                    rule = rule[1:]
1685 1
1686 1
                                # new "current char"
1687
                                if not rule:
1688 1
                                    rule = ''
1689 1
                                    char = ''
1690 1
                                else:
1691
                                    char = rule[0]
1692
1693
                                if (
1694
                                    _phonet_rules[pos]
1695 1
                                    and '^^'
1696
                                    in cast(str, _phonet_rules[pos])[1:]
1697 1
                                ):
1698
                                    if char:
1699
                                        dest = (
1700 1
                                            dest[0:j]
1701 1
                                            + char
1702 1
                                            + dest[min(len(dest), j + 1) :]
1703
                                        )
1704 1
                                        j += 1
1705
1706 1
                                    src = src[i + 1 :]
1707
                                    i = 0
1708
                                    zeta0 = 1
1709
1710 1
                            break
1711 1
1712
                        pos += 3
1713
1714
                        if pos > end1 and start2 > 0:
1715
                            pos = start2
1716 1
                            start1 = start2
1717
                            end1 = end2
1718 1
                            start2 = -1
1719 1
                            end2 = -1
1720 1
1721
                if zeta0 == 0:
1722 1
                    if char and ((j == 0) or (dest[j - 1] != char)):
1723
                        # delete multiple letters only
1724 1
                        dest = (
1725
                            dest[0:j] + char + dest[min(j + 1, term_length) :]
1726 1
                        )
1727 1
                        j += 1
1728 1
1729 1
                    i += 1
1730 1
                    zeta = 0
1731 1
1732
            dest = dest[0:j]
1733 1
1734 1
            return dest
1735
1736 1
        _initialize_phonet(self._lang)
1737
1738
        word = unicode_normalize('NFKC', word)
1739 1
        return _phonet(word, self._mode, self._lang)
1740
1741 1
1742 1
if __name__ == '__main__':
    # When this module is executed directly as a script, run every doctest
    # embedded in its docstrings; importing the module never triggers this.
    from doctest import testmod

    testmod()
1746