Completed
Pull Request — master (#138)
by Chris
14:20
created

abydos.phonetic._phonet.Phonet.encode()   F

Complexity

Conditions 142

Size

Total Lines 615
Code Lines 381

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 269
CRAP Score 142

Importance

Changes 0
Metric Value
eloc 381
dl 0
loc 615
ccs 269
cts 269
cp 1
rs 0
c 0
b 0
f 0
cc 142
nop 4
crap 142

How to fix   Long Method    Complexity   

Long Method

Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.

For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.

Commonly applied refactorings include Extract Method.

Complexity

Complex methods like abydos.phonetic._phonet.Phonet.encode() often do a lot of different things. To break such a method down, we need to identify a cohesive component within its class. A common approach to finding such a component is to look for fields/methods that share the same prefixes or suffixes.

Once you have determined the fields that belong together, you can apply the Extract Class refactoring. If the component makes sense as a sub-class, Extract Subclass is also a candidate, and is often faster.

1
# -*- coding: utf-8 -*-
0 ignored issues
show
coding-style introduced by
Too many lines in module (1733/1000)
Loading history...
2
3
# Copyright 2014-2018 by Christopher C. Little.
4
# This file is part of Abydos.
5
#
6
# Abydos is free software: you can redistribute it and/or modify
7
# it under the terms of the GNU General Public License as published by
8
# the Free Software Foundation, either version 3 of the License, or
9
# (at your option) any later version.
10
#
11
# Abydos is distributed in the hope that it will be useful,
12
# but WITHOUT ANY WARRANTY; without even the implied warranty of
13
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14
# GNU General Public License for more details.
15
#
16
# You should have received a copy of the GNU General Public License
17
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.
18
19 1
"""abydos.phonetic._phonet.
20
21
The phonetic._phonet module implements the phonet algorithm (a.k.a. Hannoveraner
22
Phonetik), intended chiefly for German.
23
"""
24
25 1
from __future__ import unicode_literals
26
27 1
from collections import Counter
28 1
from unicodedata import normalize as unicode_normalize
29
30 1
from six import text_type
31 1
from six.moves import range
32
33 1
from ._phonetic import Phonetic
34
35 1
__all__ = ['Phonet', 'phonet']
36
37
38 1
class Phonet(Phonetic):
0 ignored issues
show
Unused Code introduced by
The variable __class__ seems to be unused.
Loading history...
39
    """Phonet code.
40
41
    phonet ("Hannoveraner Phonetik") was developed by Jörg Michael and
42
    documented in :cite:`Michael:1999`.
43
44
    This is a port of Jesper Zedlitz's code, which is licensed LGPL
45
    :cite:`Zedlitz:2015`.
46
47
    That is, in turn, based on Michael's C code, which is also licensed LGPL
48
    :cite:`Michael:2007`.
49
    """
50
51 1
    _rules_no_lang = (  # separator chars
52
        # fmt: off
53
        '´', ' ', ' ',
54
        '"', ' ', ' ',
55
        '`$', '', '',
56
        '\'', ' ', ' ',
57
        ',', ',', ',',
58
        ';', ',', ',',
59
        '-', ' ', ' ',
60
        ' ', ' ', ' ',
61
        '.', '.', '.',
62
        ':', '.', '.',
63
        # German umlauts
64
        'Ä', 'AE', 'AE',
65
        'Ö', 'OE', 'OE',
66
        'Ü', 'UE', 'UE',
67
        'ß', 'S', 'S',
68
        # international umlauts
69
        'À', 'A', 'A',
70
        'Á', 'A', 'A',
71
        'Â', 'A', 'A',
72
        'Ã', 'A', 'A',
73
        'Å', 'A', 'A',
74
        'Æ', 'AE', 'AE',
75
        'Ç', 'C', 'C',
76
        'Ð', 'DJ', 'DJ',
77
        'È', 'E', 'E',
78
        'É', 'E', 'E',
79
        'Ê', 'E', 'E',
80
        'Ë', 'E', 'E',
81
        'Ì', 'I', 'I',
82
        'Í', 'I', 'I',
83
        'Î', 'I', 'I',
84
        'Ï', 'I', 'I',
85
        'Ñ', 'NH', 'NH',
86
        'Ò', 'O', 'O',
87
        'Ó', 'O', 'O',
88
        'Ô', 'O', 'O',
89
        'Õ', 'O', 'O',
90
        'Œ', 'OE', 'OE',
91
        'Ø', 'OE', 'OE',
92
        'Š', 'SH', 'SH',
93
        'Þ', 'TH', 'TH',
94
        'Ù', 'U', 'U',
95
        'Ú', 'U', 'U',
96
        'Û', 'U', 'U',
97
        'Ý', 'Y', 'Y',
98
        'Ÿ', 'Y', 'Y',
99
        # 'normal' letters (A-Z)
100
        'MC^', 'MAC', 'MAC',
101
        'MC^', 'MAC', 'MAC',
102
        'M´^', 'MAC', 'MAC',
103
        'M\'^', 'MAC', 'MAC',
104
        'O´^', 'O', 'O',
105
        'O\'^', 'O', 'O',
106
        'VAN DEN ^', 'VANDEN', 'VANDEN',
107
        None, None, None
108
        # fmt: on
109
    )
110
111 1
    _rules_german = (  # separator chars
112
        # fmt: off
113
        '´', ' ', ' ',
114
        '"', ' ', ' ',
115
        '`$', '', '',
116
        '\'', ' ', ' ',
117
        ',', ' ', ' ',
118
        ';', ' ', ' ',
119
        '-', ' ', ' ',
120
        ' ', ' ', ' ',
121
        '.', '.', '.',
122
        ':', '.', '.',
123
        # German umlauts
124
        'ÄE', 'E', 'E',
125
        'ÄU<', 'EU', 'EU',
126
        'ÄV(AEOU)-<', 'EW', None,
127
        'Ä$', 'Ä', None,
128
        'Ä<', None, 'E',
129
        'Ä', 'E', None,
130
        'ÖE', 'Ö', 'Ö',
131
        'ÖU', 'Ö', 'Ö',
132
        'ÖVER--<', 'ÖW', None,
133
        'ÖV(AOU)-', 'ÖW', None,
134
        'ÜBEL(GNRW)-^^', 'ÜBL ', 'IBL ',
135
        'ÜBER^^', 'ÜBA', 'IBA',
136
        'ÜE', 'Ü', 'I',
137
        'ÜVER--<', 'ÜW', None,
138
        'ÜV(AOU)-', 'ÜW', None,
139
        'Ü', None, 'I',
140
        'ßCH<', None, 'Z',
141
        'ß<', 'S', 'Z',
142
        # international umlauts
143
        'À<', 'A', 'A',
144
        'Á<', 'A', 'A',
145
        'Â<', 'A', 'A',
146
        'Ã<', 'A', 'A',
147
        'Å<', 'A', 'A',
148
        'ÆER-', 'E', 'E',
149
        'ÆU<', 'EU', 'EU',
150
        'ÆV(AEOU)-<', 'EW', None,
151
        'Æ$', 'Ä', None,
152
        'Æ<', None, 'E',
153
        'Æ', 'E', None,
154
        'Ç', 'Z', 'Z',
155
        'ÐÐ-', '', '',
156
        'Ð', 'DI', 'TI',
157
        'È<', 'E', 'E',
158
        'É<', 'E', 'E',
159
        'Ê<', 'E', 'E',
160
        'Ë', 'E', 'E',
161
        'Ì<', 'I', 'I',
162
        'Í<', 'I', 'I',
163
        'Î<', 'I', 'I',
164
        'Ï', 'I', 'I',
165
        'ÑÑ-', '', '',
166
        'Ñ', 'NI', 'NI',
167
        'Ò<', 'O', 'U',
168
        'Ó<', 'O', 'U',
169
        'Ô<', 'O', 'U',
170
        'Õ<', 'O', 'U',
171
        'Œ<', 'Ö', 'Ö',
172
        'Ø(IJY)-<', 'E', 'E',
173
        'Ø<', 'Ö', 'Ö',
174
        'Š', 'SH', 'Z',
175
        'Þ', 'T', 'T',
176
        'Ù<', 'U', 'U',
177
        'Ú<', 'U', 'U',
178
        'Û<', 'U', 'U',
179
        'Ý<', 'I', 'I',
180
        'Ÿ<', 'I', 'I',
181
        # 'normal' letters (A-Z)
182
        'ABELLE$', 'ABL', 'ABL',
183
        'ABELL$', 'ABL', 'ABL',
184
        'ABIENNE$', 'ABIN', 'ABIN',
185
        'ACHME---^', 'ACH', 'AK',
186
        'ACEY$', 'AZI', 'AZI',
187
        'ADV', 'ATW', None,
188
        'AEGL-', 'EK', None,
189
        'AEU<', 'EU', 'EU',
190
        'AE2', 'E', 'E',
191
        'AFTRAUBEN------', 'AFT ', 'AFT ',
192
        'AGL-1', 'AK', None,
193
        'AGNI-^', 'AKN', 'AKN',
194
        'AGNIE-', 'ANI', 'ANI',
195
        'AGN(AEOU)-$', 'ANI', 'ANI',
196
        'AH(AIOÖUÜY)-', 'AH', None,
197
        'AIA2', 'AIA', 'AIA',
198
        'AIE$', 'E', 'E',
199
        'AILL(EOU)-', 'ALI', 'ALI',
200
        'AINE$', 'EN', 'EN',
201
        'AIRE$', 'ER', 'ER',
202
        'AIR-', 'E', 'E',
203
        'AISE$', 'ES', 'EZ',
204
        'AISSANCE$', 'ESANS', 'EZANZ',
205
        'AISSE$', 'ES', 'EZ',
206
        'AIX$', 'EX', 'EX',
207
        'AJ(AÄEÈÉÊIOÖUÜ)--', 'A', 'A',
208
        'AKTIE', 'AXIE', 'AXIE',
209
        'AKTUEL', 'AKTUEL', None,
210
        'ALOI^', 'ALOI', 'ALUI',  # Don't merge these rules
211
        'ALOY^', 'ALOI', 'ALUI',  # needed by 'check_rules'
212
        'AMATEU(RS)-', 'AMATÖ', 'ANATÖ',
213
        'ANCH(OEI)-', 'ANSH', 'ANZ',
214
        'ANDERGEGANG----', 'ANDA GE', 'ANTA KE',
215
        'ANDERGEHE----', 'ANDA ', 'ANTA ',
216
        'ANDERGESETZ----', 'ANDA GE', 'ANTA KE',
217
        'ANDERGING----', 'ANDA ', 'ANTA ',
218
        'ANDERSETZ(ET)-----', 'ANDA ', 'ANTA ',
219
        'ANDERZUGEHE----', 'ANDA ZU ', 'ANTA ZU ',
220
        'ANDERZUSETZE-----', 'ANDA ZU ', 'ANTA ZU ',
221
        'ANER(BKO)---^^', 'AN', None,
222
        'ANHAND---^$', 'AN H', 'AN ',
223
        'ANH(AÄEIOÖUÜY)--^^', 'AN', None,
224
        'ANIELLE$', 'ANIEL', 'ANIL',
225
        'ANIEL', 'ANIEL', None,
226
        'ANSTELLE----^$', 'AN ST', 'AN ZT',
227
        'ANTI^^', 'ANTI', 'ANTI',
228
        'ANVER^^', 'ANFA', 'ANFA',
229
        'ATIA$', 'ATIA', 'ATIA',
230
        'ATIA(NS)--', 'ATI', 'ATI',
231
        'ATI(AÄOÖUÜ)-', 'AZI', 'AZI',
232
        'AUAU--', '', '',
233
        'AUERE$', 'AUERE', None,
234
        'AUERE(NS)-$', 'AUERE', None,
235
        'AUERE(AIOUY)--', 'AUER', None,
236
        'AUER(AÄIOÖUÜY)-', 'AUER', None,
237
        'AUER<', 'AUA', 'AUA',
238
        'AUF^^', 'AUF', 'AUF',
239
        'AULT$', 'O', 'U',
240
        'AUR(BCDFGKLMNQSTVWZ)-', 'AUA', 'AUA',
241
        'AUR$', 'AUA', 'AUA',
242
        'AUSSE$', 'OS', 'UZ',
243
        'AUS(ST)-^', 'AUS', 'AUS',
244
        'AUS^^', 'AUS', 'AUS',
245
        'AUTOFAHR----', 'AUTO ', 'AUTU ',
246
        'AUTO^^', 'AUTO', 'AUTU',
247
        'AUX(IY)-', 'AUX', 'AUX',
248
        'AUX', 'O', 'U',
249
        'AU', 'AU', 'AU',
250
        'AVER--<', 'AW', None,
251
        'AVIER$', 'AWIE', 'AFIE',
252
        'AV(EÈÉÊI)-^', 'AW', None,
253
        'AV(AOU)-', 'AW', None,
254
        'AYRE$', 'EIRE', 'EIRE',
255
        'AYRE(NS)-$', 'EIRE', 'EIRE',
256
        'AYRE(AIOUY)--', 'EIR', 'EIR',
257
        'AYR(AÄIOÖUÜY)-', 'EIR', 'EIR',
258
        'AYR<', 'EIA', 'EIA',
259
        'AYER--<', 'EI', 'EI',
260
        'AY(AÄEIOÖUÜY)--', 'A', 'A',
261
        'AË', 'E', 'E',
262
        'A(IJY)<', 'EI', 'EI',
263
        'BABY^$', 'BEBI', 'BEBI',
264
        'BAB(IY)^', 'BEBI', 'BEBI',
265
        'BEAU^$', 'BO', None,
266
        'BEA(BCMNRU)-^', 'BEA', 'BEA',
267
        'BEAT(AEIMORU)-^', 'BEAT', 'BEAT',
268
        'BEE$', 'BI', 'BI',
269
        'BEIGE^$', 'BESH', 'BEZ',
270
        'BENOIT--', 'BENO', 'BENU',
271
        'BER(DT)-', 'BER', None,
272
        'BERN(DT)-', 'BERN', None,
273
        'BE(LMNRST)-^', 'BE', 'BE',
274
        'BETTE$', 'BET', 'BET',
275
        'BEVOR^$', 'BEFOR', None,
276
        'BIC$', 'BIZ', 'BIZ',
277
        'BOWL(EI)-', 'BOL', 'BUL',
278
        'BP(AÄEÈÉÊIÌÍÎOÖRUÜY)-', 'B', 'B',
279
        'BRINGEND-----^', 'BRI', 'BRI',
280
        'BRINGEND-----', ' BRI', ' BRI',
281
        'BROW(NS)-', 'BRAU', 'BRAU',
282
        'BUDGET7', 'BÜGE', 'BIKE',
283
        'BUFFET7', 'BÜFE', 'BIFE',
284
        'BYLLE$', 'BILE', 'BILE',
285
        'BYLL$', 'BIL', 'BIL',
286
        'BYPA--^', 'BEI', 'BEI',
287
        'BYTE<', 'BEIT', 'BEIT',
288
        'BY9^', 'BÜ', None,
289
        'B(SßZ)$', 'BS', None,
290
        'CACH(EI)-^', 'KESH', 'KEZ',
291
        'CAE--', 'Z', 'Z',
292
        'CA(IY)$', 'ZEI', 'ZEI',
293
        'CE(EIJUY)--', 'Z', 'Z',
294
        'CENT<', 'ZENT', 'ZENT',
295
        'CERST(EI)----^', 'KE', 'KE',
296
        'CER$', 'ZA', 'ZA',
297
        'CE3', 'ZE', 'ZE',
298
        'CH\'S$', 'X', 'X',
299
        'CH´S$', 'X', 'X',
300
        'CHAO(ST)-', 'KAO', 'KAU',
301
        'CHAMPIO-^', 'SHEMPI', 'ZENBI',
302
        'CHAR(AI)-^', 'KAR', 'KAR',
303
        'CHAU(CDFSVWXZ)-', 'SHO', 'ZU',
304
        'CHÄ(CF)-', 'SHE', 'ZE',
305
        'CHE(CF)-', 'SHE', 'ZE',
306
        'CHEM-^', 'KE', 'KE',  # or: 'CHE', 'KE'
307
        'CHEQUE<', 'SHEK', 'ZEK',
308
        'CHI(CFGPVW)-', 'SHI', 'ZI',
309
        'CH(AEUY)-<^', 'SH', 'Z',
310
        'CHK-', '', '',
311
        'CHO(CKPS)-^', 'SHO', 'ZU',
312
        'CHRIS-', 'KRI', None,
313
        'CHRO-', 'KR', None,
314
        'CH(LOR)-<^', 'K', 'K',
315
        'CHST-', 'X', 'X',
316
        'CH(SßXZ)3', 'X', 'X',
317
        'CHTNI-3', 'CHN', 'KN',
318
        'CH^', 'K', 'K',  # or: 'CH', 'K'
319
        'CH', 'CH', 'K',
320
        'CIC$', 'ZIZ', 'ZIZ',
321
        'CIENCEFICT----', 'EIENS ', 'EIENZ ',
322
        'CIENCE$', 'EIENS', 'EIENZ',
323
        'CIER$', 'ZIE', 'ZIE',
324
        'CYB-^', 'ZEI', 'ZEI',
325
        'CY9^', 'ZÜ', 'ZI',
326
        'C(IJY)-<3', 'Z', 'Z',
327
        'CLOWN-', 'KLAU', 'KLAU',
328
        'CCH', 'Z', 'Z',
329
        'CCE-', 'X', 'X',
330
        'C(CK)-', '', '',
331
        'CLAUDET---', 'KLO', 'KLU',
332
        'CLAUDINE^$', 'KLODIN', 'KLUTIN',
333
        'COACH', 'KOSH', 'KUZ',
334
        'COLE$', 'KOL', 'KUL',
335
        'COUCH', 'KAUSH', 'KAUZ',
336
        'COW', 'KAU', 'KAU',
337
        'CQUES$', 'K', 'K',
338
        'CQUE', 'K', 'K',
339
        'CRASH--9', 'KRE', 'KRE',
340
        'CREAT-^', 'KREA', 'KREA',
341
        'CST', 'XT', 'XT',
342
        'CS<^', 'Z', 'Z',
343
        'C(SßX)', 'X', 'X',
344
        'CT\'S$', 'X', 'X',
345
        'CT(SßXZ)', 'X', 'X',
346
        'CZ<', 'Z', 'Z',
347
        'C(ÈÉÊÌÍÎÝ)3', 'Z', 'Z',
348
        'C.^', 'C.', 'C.',
349
        'CÄ-', 'Z', 'Z',
350
        'CÜ$', 'ZÜ', 'ZI',
351
        'C\'S$', 'X', 'X',
352
        'C<', 'K', 'K',
353
        'DAHER^$', 'DAHER', None,
354
        'DARAUFFOLGE-----', 'DARAUF ', 'TARAUF ',
355
        'DAVO(NR)-^$', 'DAFO', 'TAFU',
356
        'DD(SZ)--<', '', '',
357
        'DD9', 'D', None,
358
        'DEPOT7', 'DEPO', 'TEBU',
359
        'DESIGN', 'DISEIN', 'TIZEIN',
360
        'DE(LMNRST)-3^', 'DE', 'TE',
361
        'DETTE$', 'DET', 'TET',
362
        'DH$', 'T', None,
363
        'DIC$', 'DIZ', 'TIZ',
364
        'DIDR-^', 'DIT', None,
365
        'DIEDR-^', 'DIT', None,
366
        'DJ(AEIOU)-^', 'I', 'I',
367
        'DMITR-^', 'DIMIT', 'TINIT',
368
        'DRY9^', 'DRÜ', None,
369
        'DT-', '', '',
370
        'DUIS-^', 'DÜ', 'TI',
371
        'DURCH^^', 'DURCH', 'TURK',
372
        'DVA$', 'TWA', None,
373
        'DY9^', 'DÜ', None,
374
        'DYS$', 'DIS', None,
375
        'DS(CH)--<', 'T', 'T',
376
        'DST', 'ZT', 'ZT',
377
        'DZS(CH)--', 'T', 'T',
378
        'D(SßZ)', 'Z', 'Z',
379
        'D(AÄEIOÖRUÜY)-', 'D', None,
380
        'D(ÀÁÂÃÅÈÉÊÌÍÎÙÚÛ)-', 'D', None,
381
        'D\'H^', 'D', 'T',
382
        'D´H^', 'D', 'T',
383
        'D`H^', 'D', 'T',
384
        'D\'S3$', 'Z', 'Z',
385
        'D´S3$', 'Z', 'Z',
386
        'D^', 'D', None,
387
        'D', 'T', 'T',
388
        'EAULT$', 'O', 'U',
389
        'EAUX$', 'O', 'U',
390
        'EAU', 'O', 'U',
391
        'EAV', 'IW', 'IF',
392
        'EAS3$', 'EAS', None,
393
        'EA(AÄEIOÖÜY)-3', 'EA', 'EA',
394
        'EA3$', 'EA', 'EA',
395
        'EA3', 'I', 'I',
396
        'EBENSO^$', 'EBNSO', 'EBNZU',
397
        'EBENSO^^', 'EBNSO ', 'EBNZU ',
398
        'EBEN^^', 'EBN', 'EBN',
399
        'EE9', 'E', 'E',
400
        'EGL-1', 'EK', None,
401
        'EHE(IUY)--1', 'EH', None,
402
        'EHUNG---1', 'E', None,
403
        'EH(AÄIOÖUÜY)-1', 'EH', None,
404
        'EIEI--', '', '',
405
        'EIERE^$', 'EIERE', None,
406
        'EIERE$', 'EIERE', None,
407
        'EIERE(NS)-$', 'EIERE', None,
408
        'EIERE(AIOUY)--', 'EIER', None,
409
        'EIER(AÄIOÖUÜY)-', 'EIER', None,
410
        'EIER<', 'EIA', None,
411
        'EIGL-1', 'EIK', None,
412
        'EIGH$', 'EI', 'EI',
413
        'EIH--', 'E', 'E',
414
        'EILLE$', 'EI', 'EI',
415
        'EIR(BCDFGKLMNQSTVWZ)-', 'EIA', 'EIA',
416
        'EIR$', 'EIA', 'EIA',
417
        'EITRAUBEN------', 'EIT ', 'EIT ',
418
        'EI', 'EI', 'EI',
419
        'EJ$', 'EI', 'EI',
420
        'ELIZ^', 'ELIS', None,
421
        'ELZ^', 'ELS', None,
422
        'EL-^', 'E', 'E',
423
        'ELANG----1', 'E', 'E',
424
        'EL(DKL)--1', 'E', 'E',
425
        'EL(MNT)--1$', 'E', 'E',
426
        'ELYNE$', 'ELINE', 'ELINE',
427
        'ELYN$', 'ELIN', 'ELIN',
428
        'EL(AÄEÈÉÊIÌÍÎOÖUÜY)-1', 'EL', 'EL',
429
        'EL-1', 'L', 'L',
430
        'EM-^', None, 'E',
431
        'EM(DFKMPQT)--1', None, 'E',
432
        'EM(AÄEÈÉÊIÌÍÎOÖUÜY)--1', None, 'E',
433
        'EM-1', None, 'N',
434
        'ENGAG-^', 'ANGA', 'ANKA',
435
        'EN-^', 'E', 'E',
436
        'ENTUEL', 'ENTUEL', None,
437
        'EN(CDGKQSTZ)--1', 'E', 'E',
438
        'EN(AÄEÈÉÊIÌÍÎNOÖUÜY)-1', 'EN', 'EN',
439
        'EN-1', '', '',
440
        'ERH(AÄEIOÖUÜ)-^', 'ERH', 'ER',
441
        'ER-^', 'E', 'E',
442
        'ERREGEND-----', ' ER', ' ER',
443
        'ERT1$', 'AT', None,
444
        'ER(DGLKMNRQTZß)-1', 'ER', None,
445
        'ER(AÄEÈÉÊIÌÍÎOÖUÜY)-1', 'ER', 'A',
446
        'ER1$', 'A', 'A',
447
        'ER<1', 'A', 'A',
448
        'ETAT7', 'ETA', 'ETA',
449
        'ETI(AÄOÖÜU)-', 'EZI', 'EZI',
450
        'EUERE$', 'EUERE', None,
451
        'EUERE(NS)-$', 'EUERE', None,
452
        'EUERE(AIOUY)--', 'EUER', None,
453
        'EUER(AÄIOÖUÜY)-', 'EUER', None,
454
        'EUER<', 'EUA', None,
455
        'EUEU--', '', '',
456
        'EUILLE$', 'Ö', 'Ö',
457
        'EUR$', 'ÖR', 'ÖR',
458
        'EUX', 'Ö', 'Ö',
459
        'EUSZ$', 'EUS', None,
460
        'EUTZ$', 'EUS', None,
461
        'EUYS$', 'EUS', 'EUZ',
462
        'EUZ$', 'EUS', None,
463
        'EU', 'EU', 'EU',
464
        'EVER--<1', 'EW', None,
465
        'EV(ÄOÖUÜ)-1', 'EW', None,
466
        'EYER<', 'EIA', 'EIA',
467
        'EY<', 'EI', 'EI',
468
        'FACETTE', 'FASET', 'FAZET',
469
        'FANS--^$', 'FE', 'FE',
470
        'FAN-^$', 'FE', 'FE',
471
        'FAULT-', 'FOL', 'FUL',
472
        'FEE(DL)-', 'FI', 'FI',
473
        'FEHLER', 'FELA', 'FELA',
474
        'FE(LMNRST)-3^', 'FE', 'FE',
475
        'FOERDERN---^', 'FÖRD', 'FÖRT',
476
        'FOERDERN---', ' FÖRD', ' FÖRT',
477
        'FOND7', 'FON', 'FUN',
478
        'FRAIN$', 'FRA', 'FRA',
479
        'FRISEU(RS)-', 'FRISÖ', 'FRIZÖ',
480
        'FY9^', 'FÜ', None,
481
        'FÖRDERN---^', 'FÖRD', 'FÖRT',
482
        'FÖRDERN---', ' FÖRD', ' FÖRT',
483
        'GAGS^$', 'GEX', 'KEX',
484
        'GAG^$', 'GEK', 'KEK',
485
        'GD', 'KT', 'KT',
486
        'GEGEN^^', 'GEGN', 'KEKN',
487
        'GEGENGEKOM-----', 'GEGN ', 'KEKN ',
488
        'GEGENGESET-----', 'GEGN ', 'KEKN ',
489
        'GEGENKOMME-----', 'GEGN ', 'KEKN ',
490
        'GEGENZUKOM---', 'GEGN ZU ', 'KEKN ZU ',
491
        'GENDETWAS-----$', 'GENT ', 'KENT ',
492
        'GENRE', 'IORE', 'IURE',
493
        'GE(LMNRST)-3^', 'GE', 'KE',
494
        'GER(DKT)-', 'GER', None,
495
        'GETTE$', 'GET', 'KET',
496
        'GGF.', 'GF.', None,
497
        'GG-', '', '',
498
        'GH', 'G', None,
499
        'GI(AOU)-^', 'I', 'I',
500
        'GION-3', 'KIO', 'KIU',
501
        'G(CK)-', '', '',
502
        'GJ(AEIOU)-^', 'I', 'I',
503
        'GMBH^$', 'GMBH', 'GMBH',
504
        'GNAC$', 'NIAK', 'NIAK',
505
        'GNON$', 'NION', 'NIUN',
506
        'GN$', 'N', 'N',
507
        'GONCAL-^', 'GONZA', 'KUNZA',
508
        'GRY9^', 'GRÜ', None,
509
        'G(SßXZ)-<', 'K', 'K',
510
        'GUCK-', 'KU', 'KU',
511
        'GUISEP-^', 'IUSE', 'IUZE',
512
        'GUI-^', 'G', 'K',
513
        'GUTAUSSEH------^', 'GUT ', 'KUT ',
514
        'GUTGEHEND------^', 'GUT ', 'KUT ',
515
        'GY9^', 'GÜ', None,
516
        'G(AÄEILOÖRUÜY)-', 'G', None,
517
        'G(ÀÁÂÃÅÈÉÊÌÍÎÙÚÛ)-', 'G', None,
518
        'G\'S$', 'X', 'X',
519
        'G´S$', 'X', 'X',
520
        'G^', 'G', None,
521
        'G', 'K', 'K',
522
        'HA(HIUY)--1', 'H', None,
523
        'HANDVOL---^', 'HANT ', 'ANT ',
524
        'HANNOVE-^', 'HANOF', None,
525
        'HAVEN7$', 'HAFN', None,
526
        'HEAD-', 'HE', 'E',
527
        'HELIEGEN------', 'E ', 'E ',
528
        'HESTEHEN------', 'E ', 'E ',
529
        'HE(LMNRST)-3^', 'HE', 'E',
530
        'HE(LMN)-1', 'E', 'E',
531
        'HEUR1$', 'ÖR', 'ÖR',
532
        'HE(HIUY)--1', 'H', None,
533
        'HIH(AÄEIOÖUÜY)-1', 'IH', None,
534
        'HLH(AÄEIOÖUÜY)-1', 'LH', None,
535
        'HMH(AÄEIOÖUÜY)-1', 'MH', None,
536
        'HNH(AÄEIOÖUÜY)-1', 'NH', None,
537
        'HOBBY9^', 'HOBI', None,
538
        'HOCHBEGAB-----^', 'HOCH ', 'UK ',
539
        'HOCHTALEN-----^', 'HOCH ', 'UK ',
540
        'HOCHZUFRI-----^', 'HOCH ', 'UK ',
541
        'HO(HIY)--1', 'H', None,
542
        'HRH(AÄEIOÖUÜY)-1', 'RH', None,
543
        'HUH(AÄEIOÖUÜY)-1', 'UH', None,
544
        'HUIS^^', 'HÜS', 'IZ',
545
        'HUIS$', 'ÜS', 'IZ',
546
        'HUI--1', 'H', None,
547
        'HYGIEN^', 'HÜKIEN', None,
548
        'HY9^', 'HÜ', None,
549
        'HY(BDGMNPST)-', 'Ü', None,
550
        'H.^', None, 'H.',
551
        'HÄU--1', 'H', None,
552
        'H^', 'H', '',
553
        'H', '', '',
554
        'ICHELL---', 'ISH', 'IZ',
555
        'ICHI$', 'ISHI', 'IZI',
556
        'IEC$', 'IZ', 'IZ',
557
        'IEDENSTELLE------', 'IDN ', 'ITN ',
558
        'IEI-3', '', '',
559
        'IELL3', 'IEL', 'IEL',
560
        'IENNE$', 'IN', 'IN',
561
        'IERRE$', 'IER', 'IER',
562
        'IERZULAN---', 'IR ZU ', 'IR ZU ',
563
        'IETTE$', 'IT', 'IT',
564
        'IEU', 'IÖ', 'IÖ',
565
        'IE<4', 'I', 'I',
566
        'IGL-1', 'IK', None,
567
        'IGHT3$', 'EIT', 'EIT',
568
        'IGNI(EO)-', 'INI', 'INI',
569
        'IGN(AEOU)-$', 'INI', 'INI',
570
        'IHER(DGLKRT)--1', 'IHE', None,
571
        'IHE(IUY)--', 'IH', None,
572
        'IH(AIOÖUÜY)-', 'IH', None,
573
        'IJ(AOU)-', 'I', 'I',
574
        'IJ$', 'I', 'I',
575
        'IJ<', 'EI', 'EI',
576
        'IKOLE$', 'IKOL', 'IKUL',
577
        'ILLAN(STZ)--4', 'ILIA', 'ILIA',
578
        'ILLAR(DT)--4', 'ILIA', 'ILIA',
579
        'IMSTAN----^', 'IM ', 'IN ',
580
        'INDELERREGE------', 'INDL ', 'INTL ',
581
        'INFRAGE-----^$', 'IN ', 'IN ',
582
        'INTERN(AOU)-^', 'INTAN', 'INTAN',
583
        'INVER-', 'INWE', 'INFE',
584
        'ITI(AÄIOÖUÜ)-', 'IZI', 'IZI',
585
        'IUSZ$', 'IUS', None,
586
        'IUTZ$', 'IUS', None,
587
        'IUZ$', 'IUS', None,
588
        'IVER--<', 'IW', None,
589
        'IVIER$', 'IWIE', 'IFIE',
590
        'IV(ÄOÖUÜ)-', 'IW', None,
591
        'IV<3', 'IW', None,
592
        'IY2', 'I', None,
593
        'I(ÈÉÊ)<4', 'I', 'I',
594
        'JAVIE---<^', 'ZA', 'ZA',
595
        'JEANS^$', 'JINS', 'INZ',
596
        'JEANNE^$', 'IAN', 'IAN',
597
        'JEAN-^', 'IA', 'IA',
598
        'JER-^', 'IE', 'IE',
599
        'JE(LMNST)-', 'IE', 'IE',
600
        'JI^', 'JI', None,
601
        'JOR(GK)^$', 'IÖRK', 'IÖRK',
602
        'J', 'I', 'I',
603
        'KC(ÄEIJ)-', 'X', 'X',
604
        'KD', 'KT', None,
605
        'KE(LMNRST)-3^', 'KE', 'KE',
606
        'KG(AÄEILOÖRUÜY)-', 'K', None,
607
        'KH<^', 'K', 'K',
608
        'KIC$', 'KIZ', 'KIZ',
609
        'KLE(LMNRST)-3^', 'KLE', 'KLE',
610
        'KOTELE-^', 'KOTL', 'KUTL',
611
        'KREAT-^', 'KREA', 'KREA',
612
        'KRÜS(TZ)--^', 'KRI', None,
613
        'KRYS(TZ)--^', 'KRI', None,
614
        'KRY9^', 'KRÜ', None,
615
        'KSCH---', 'K', 'K',
616
        'KSH--', 'K', 'K',
617
        'K(SßXZ)7', 'X', 'X',  # implies 'KST' -> 'XT'
618
        'KT\'S$', 'X', 'X',
619
        'KTI(AIOU)-3', 'XI', 'XI',
620
        'KT(SßXZ)', 'X', 'X',
621
        'KY9^', 'KÜ', None,
622
        'K\'S$', 'X', 'X',
623
        'K´S$', 'X', 'X',
624
        'LANGES$', ' LANGES', ' LANKEZ',
625
        'LANGE$', ' LANGE', ' LANKE',
626
        'LANG$', ' LANK', ' LANK',
627
        'LARVE-', 'LARF', 'LARF',
628
        'LD(SßZ)$', 'LS', 'LZ',
629
        'LD\'S$', 'LS', 'LZ',
630
        'LD´S$', 'LS', 'LZ',
631
        'LEAND-^', 'LEAN', 'LEAN',
632
        'LEERSTEHE-----^', 'LER ', 'LER ',
633
        'LEICHBLEIB-----', 'LEICH ', 'LEIK ',
634
        'LEICHLAUTE-----', 'LEICH ', 'LEIK ',
635
        'LEIDERREGE------', 'LEIT ', 'LEIT ',
636
        'LEIDGEPR----^', 'LEIT ', 'LEIT ',
637
        'LEINSTEHE-----', 'LEIN ', 'LEIN ',
638
        'LEL-', 'LE', 'LE',
639
        'LE(MNRST)-3^', 'LE', 'LE',
640
        'LETTE$', 'LET', 'LET',
641
        'LFGNAG-', 'LFGAN', 'LFKAN',
642
        'LICHERWEIS----', 'LICHA ', 'LIKA ',
643
        'LIC$', 'LIZ', 'LIZ',
644
        'LIVE^$', 'LEIF', 'LEIF',
645
        'LT(SßZ)$', 'LS', 'LZ',
646
        'LT\'S$', 'LS', 'LZ',
647
        'LT´S$', 'LS', 'LZ',
648
        'LUI(GS)--', 'LU', 'LU',
649
        'LV(AIO)-', 'LW', None,
650
        'LY9^', 'LÜ', None,
651
        'LSTS$', 'LS', 'LZ',
652
        'LZ(BDFGKLMNPQRSTVWX)-', 'LS', None,
653
        'L(SßZ)$', 'LS', None,
654
        'MAIR-<', 'MEI', 'NEI',
655
        'MANAG-', 'MENE', 'NENE',
656
        'MANUEL', 'MANUEL', None,
657
        'MASSEU(RS)-', 'MASÖ', 'NAZÖ',
658
        'MATCH', 'MESH', 'NEZ',
659
        'MAURICE', 'MORIS', 'NURIZ',
660
        'MBH^$', 'MBH', 'MBH',
661
        'MB(ßZ)$', 'MS', None,
662
        'MB(SßTZ)-', 'M', 'N',
663
        'MCG9^', 'MAK', 'NAK',
664
        'MC9^', 'MAK', 'NAK',
665
        'MEMOIR-^', 'MEMOA', 'NENUA',
666
        'MERHAVEN$', 'MAHAFN', None,
667
        'ME(LMNRST)-3^', 'ME', 'NE',
668
        'MEN(STZ)--3', 'ME', None,
669
        'MEN$', 'MEN', None,
670
        'MIGUEL-', 'MIGE', 'NIKE',
671
        'MIKE^$', 'MEIK', 'NEIK',
672
        'MITHILFE----^$', 'MIT H', 'NIT ',
673
        'MN$', 'M', None,
674
        'MN', 'N', 'N',
675
        'MPJUTE-', 'MPUT', 'NBUT',
676
        'MP(ßZ)$', 'MS', None,
677
        'MP(SßTZ)-', 'M', 'N',
678
        'MP(BDJLMNPQVW)-', 'MB', 'NB',
679
        'MY9^', 'MÜ', None,
680
        'M(ßZ)$', 'MS', None,
681
        'M´G7^', 'MAK', 'NAK',
682
        'M\'G7^', 'MAK', 'NAK',
683
        'M´^', 'MAK', 'NAK',
684
        'M\'^', 'MAK', 'NAK',
685
        'M', None, 'N',
686
        'NACH^^', 'NACH', 'NAK',
687
        'NADINE', 'NADIN', 'NATIN',
688
        'NAIV--', 'NA', 'NA',
689
        'NAISE$', 'NESE', 'NEZE',
690
        'NAUGENOMM------', 'NAU ', 'NAU ',
691
        'NAUSOGUT$', 'NAUSO GUT', 'NAUZU KUT',
692
        'NCH$', 'NSH', 'NZ',
693
        'NCOISE$', 'SOA', 'ZUA',
694
        'NCOIS$', 'SOA', 'ZUA',
695
        'NDAR$', 'NDA', 'NTA',
696
        'NDERINGEN------', 'NDE ', 'NTE ',
697
        'NDRO(CDKTZ)-', 'NTRO', None,
698
        'ND(BFGJLMNPQVW)-', 'NT', None,
699
        'ND(SßZ)$', 'NS', 'NZ',
700
        'ND\'S$', 'NS', 'NZ',
701
        'ND´S$', 'NS', 'NZ',
702
        'NEBEN^^', 'NEBN', 'NEBN',
703
        'NENGELERN------', 'NEN ', 'NEN ',
704
        'NENLERN(ET)---', 'NEN LE', 'NEN LE',
705
        'NENZULERNE---', 'NEN ZU LE', 'NEN ZU LE',
706
        'NE(LMNRST)-3^', 'NE', 'NE',
707
        'NEN-3', 'NE', 'NE',
708
        'NETTE$', 'NET', 'NET',
709
        'NGU^^', 'NU', 'NU',
710
        'NG(BDFJLMNPQRTVW)-', 'NK', 'NK',
711
        'NH(AUO)-$', 'NI', 'NI',
712
        'NICHTSAHNEN-----', 'NIX ', 'NIX ',
713
        'NICHTSSAGE----', 'NIX ', 'NIX ',
714
        'NICHTS^^', 'NIX', 'NIX',
715
        'NICHT^^', 'NICHT', 'NIKT',
716
        'NINE$', 'NIN', 'NIN',
717
        'NON^^', 'NON', 'NUN',
718
        'NOTLEIDE-----^', 'NOT ', 'NUT ',
719
        'NOT^^', 'NOT', 'NUT',
720
        'NTI(AIOU)-3', 'NZI', 'NZI',
721
        'NTIEL--3', 'NZI', 'NZI',
722
        'NT(SßZ)$', 'NS', 'NZ',
723
        'NT\'S$', 'NS', 'NZ',
724
        'NT´S$', 'NS', 'NZ',
725
        'NYLON', 'NEILON', 'NEILUN',
726
        'NY9^', 'NÜ', None,
727
        'NSTZUNEH---', 'NST ZU ', 'NZT ZU ',
728
        'NSZ-', 'NS', None,
729
        'NSTS$', 'NS', 'NZ',
730
        'NZ(BDFGKLMNPQRSTVWX)-', 'NS', None,
731
        'N(SßZ)$', 'NS', None,
732
        'OBERE-', 'OBER', None,
733
        'OBER^^', 'OBA', 'UBA',
734
        'OEU2', 'Ö', 'Ö',
735
        'OE<2', 'Ö', 'Ö',
736
        'OGL-', 'OK', None,
737
        'OGNIE-', 'ONI', 'UNI',
738
        'OGN(AEOU)-$', 'ONI', 'UNI',
739
        'OH(AIOÖUÜY)-', 'OH', None,
740
        'OIE$', 'Ö', 'Ö',
741
        'OIRE$', 'OA', 'UA',
742
        'OIR$', 'OA', 'UA',
743
        'OIX', 'OA', 'UA',
744
        'OI<3', 'EU', 'EU',
745
        'OKAY^$', 'OKE', 'UKE',
746
        'OLYN$', 'OLIN', 'ULIN',
747
        'OO(DLMZ)-', 'U', None,
748
        'OO$', 'U', None,
749
        'OO-', '', '',
750
        'ORGINAL-----', 'ORI', 'URI',
751
        'OTI(AÄOÖUÜ)-', 'OZI', 'UZI',
752
        'OUI^', 'WI', 'FI',
753
        'OUILLE$', 'ULIE', 'ULIE',
754
        'OU(DT)-^', 'AU', 'AU',
755
        'OUSE$', 'AUS', 'AUZ',
756
        'OUT-', 'AU', 'AU',
757
        'OU', 'U', 'U',
758
        'O(FV)$', 'AU', 'AU',  # due to 'OW$' -> 'AU'
759
        'OVER--<', 'OW', None,
760
        'OV(AOU)-', 'OW', None,
761
        'OW$', 'AU', 'AU',
762
        'OWS$', 'OS', 'UZ',
763
        'OJ(AÄEIOÖUÜ)--', 'O', 'U',
764
        'OYER', 'OIA', None,
765
        'OY(AÄEIOÖUÜ)--', 'O', 'U',
766
        'O(JY)<', 'EU', 'EU',
767
        'OZ$', 'OS', None,
768
        'O´^', 'O', 'U',
769
        'O\'^', 'O', 'U',
770
        'O', None, 'U',
771
        'PATIEN--^', 'PAZI', 'PAZI',
772
        'PENSIO-^', 'PANSI', 'PANZI',
773
        'PE(LMNRST)-3^', 'PE', 'PE',
774
        'PFER-^', 'FE', 'FE',
775
        'P(FH)<', 'F', 'F',
776
        'PIC^$', 'PIK', 'PIK',
777
        'PIC$', 'PIZ', 'PIZ',
778
        'PIPELINE', 'PEIBLEIN', 'PEIBLEIN',
779
        'POLYP-', 'POLÜ', None,
780
        'POLY^^', 'POLI', 'PULI',
781
        'PORTRAIT7', 'PORTRE', 'PURTRE',
782
        'POWER7', 'PAUA', 'PAUA',
783
        'PP(FH)--<', 'B', 'B',
784
        'PP-', '', '',
785
        'PRODUZ-^', 'PRODU', 'BRUTU',
786
        'PRODUZI--', ' PRODU', ' BRUTU',
787
        'PRIX^$', 'PRI', 'PRI',
788
        'PS-^^', 'P', None,
789
        'P(SßZ)^', None, 'Z',
790
        'P(SßZ)$', 'BS', None,
791
        'PT-^', '', '',
792
        'PTI(AÄOÖUÜ)-3', 'BZI', 'BZI',
793
        'PY9^', 'PÜ', None,
794
        'P(AÄEIOÖRUÜY)-', 'P', 'P',
795
        'P(ÀÁÂÃÅÈÉÊÌÍÎÙÚÛ)-', 'P', None,
796
        'P.^', None, 'P.',
797
        'P^', 'P', None,
798
        'P', 'B', 'B',
799
        'QI-', 'Z', 'Z',
800
        'QUARANT--', 'KARA', 'KARA',
801
        'QUE(LMNRST)-3', 'KWE', 'KFE',
802
        'QUE$', 'K', 'K',
803
        'QUI(NS)$', 'KI', 'KI',
804
        'QUIZ7', 'KWIS', None,
805
        'Q(UV)7', 'KW', 'KF',
806
        'Q<', 'K', 'K',
807
        'RADFAHR----', 'RAT ', 'RAT ',
808
        'RAEFTEZEHRE-----', 'REFTE ', 'REFTE ',
809
        'RCH', 'RCH', 'RK',
810
        'REA(DU)---3^', 'R', None,
811
        'REBSERZEUG------', 'REBS ', 'REBZ ',
812
        'RECHERCH^', 'RESHASH', 'REZAZ',
813
        'RECYCL--', 'RIZEI', 'RIZEI',
814
        'RE(ALST)-3^', 'RE', None,
815
        'REE$', 'RI', 'RI',
816
        'RER$', 'RA', 'RA',
817
        'RE(MNR)-4', 'RE', 'RE',
818
        'RETTE$', 'RET', 'RET',
819
        'REUZ$', 'REUZ', None,
820
        'REW$', 'RU', 'RU',
821
        'RH<^', 'R', 'R',
822
        'RJA(MN)--', 'RI', 'RI',
823
        'ROWD-^', 'RAU', 'RAU',
824
        'RTEMONNAIE-', 'RTMON', 'RTNUN',
825
        'RTI(AÄOÖUÜ)-3', 'RZI', 'RZI',
826
        'RTIEL--3', 'RZI', 'RZI',
827
        'RV(AEOU)-3', 'RW', None,
828
        'RY(KN)-$', 'RI', 'RI',
829
        'RY9^', 'RÜ', None,
830
        'RÄFTEZEHRE-----', 'REFTE ', 'REFTE ',
831
        'SAISO-^', 'SES', 'ZEZ',
832
        'SAFE^$', 'SEIF', 'ZEIF',
833
        'SAUCE-^', 'SOS', 'ZUZ',
834
        'SCHLAGGEBEN-----<', 'SHLAK ', 'ZLAK ',
835
        'SCHSCH---7', '', '',
836
        'SCHTSCH', 'SH', 'Z',
837
        'SC(HZ)<', 'SH', 'Z',
838
        'SC', 'SK', 'ZK',
839
        'SELBSTST--7^^', 'SELB', 'ZELB',
840
        'SELBST7^^', 'SELBST', 'ZELBZT',
841
        'SERVICE7^', 'SÖRWIS', 'ZÖRFIZ',
842
        'SERVI-^', 'SERW', None,
843
        'SE(LMNRST)-3^', 'SE', 'ZE',
844
        'SETTE$', 'SET', 'ZET',
845
        'SHP-^', 'S', 'Z',
846
        'SHST', 'SHT', 'ZT',
847
        'SHTSH', 'SH', 'Z',
848
        'SHT', 'ST', 'Z',
849
        'SHY9^', 'SHÜ', None,
850
        'SH^^', 'SH', None,
851
        'SH3', 'SH', 'Z',
852
        'SICHERGEGAN-----^', 'SICHA ', 'ZIKA ',
853
        'SICHERGEHE----^', 'SICHA ', 'ZIKA ',
854
        'SICHERGESTEL------^', 'SICHA ', 'ZIKA ',
855
        'SICHERSTELL-----^', 'SICHA ', 'ZIKA ',
856
        'SICHERZU(GS)--^', 'SICHA ZU ', 'ZIKA ZU ',
857
        'SIEGLI-^', 'SIKL', 'ZIKL',
858
        'SIGLI-^', 'SIKL', 'ZIKL',
859
        'SIGHT', 'SEIT', 'ZEIT',
860
        'SIGN', 'SEIN', 'ZEIN',
861
        'SKI(NPZ)-', 'SKI', 'ZKI',
862
        'SKI<^', 'SHI', 'ZI',
863
        'SODASS^$', 'SO DAS', 'ZU TAZ',
864
        'SODAß^$', 'SO DAS', 'ZU TAZ',
865
        'SOGENAN--^', 'SO GEN', 'ZU KEN',
866
        'SOUND-', 'SAUN', 'ZAUN',
867
        'STAATS^^', 'STAZ', 'ZTAZ',
868
        'STADT^^', 'STAT', 'ZTAT',
869
        'STANDE$', ' STANDE', ' ZTANTE',
870
        'START^^', 'START', 'ZTART',
871
        'STAURANT7', 'STORAN', 'ZTURAN',
872
        'STEAK-', 'STE', 'ZTE',
873
        'STEPHEN-^$', 'STEW', None,
874
        'STERN', 'STERN', None,
875
        'STRAF^^', 'STRAF', 'ZTRAF',
876
        'ST\'S$', 'Z', 'Z',
877
        'ST´S$', 'Z', 'Z',
878
        'STST--', '', '',
879
        'STS(ACEÈÉÊHIÌÍÎOUÄÜÖ)--', 'ST', 'ZT',
880
        'ST(SZ)', 'Z', 'Z',
881
        'SPAREN---^', 'SPA', 'ZPA',
882
        'SPAREND----', ' SPA', ' ZPA',
883
        'S(PTW)-^^', 'S', None,
884
        'SP', 'SP', None,
885
        'STYN(AE)-$', 'STIN', 'ZTIN',
886
        'ST', 'ST', 'ZT',
887
        'SUITE<', 'SIUT', 'ZIUT',
888
        'SUKE--$', 'S', 'Z',
889
        'SURF(EI)-', 'SÖRF', 'ZÖRF',
890
        'SV(AEÈÉÊIÌÍÎOU)-<^', 'SW', None,
891
        'SYB(IY)--^', 'SIB', None,
892
        'SYL(KVW)--^', 'SI', None,
893
        'SY9^', 'SÜ', None,
894
        'SZE(NPT)-^', 'ZE', 'ZE',
895
        'SZI(ELN)-^', 'ZI', 'ZI',
896
        'SZCZ<', 'SH', 'Z',
897
        'SZT<', 'ST', 'ZT',
898
        'SZ<3', 'SH', 'Z',
899
        'SÜL(KVW)--^', 'SI', None,
900
        'S', None, 'Z',
901
        'TCH', 'SH', 'Z',
902
        'TD(AÄEIOÖRUÜY)-', 'T', None,
903
        'TD(ÀÁÂÃÅÈÉÊËÌÍÎÏÒÓÔÕØÙÚÛÝŸ)-', 'T', None,
904
        'TEAT-^', 'TEA', 'TEA',
905
        'TERRAI7^', 'TERA', 'TERA',
906
        'TE(LMNRST)-3^', 'TE', 'TE',
907
        'TH<', 'T', 'T',
908
        'TICHT-', 'TIK', 'TIK',
909
        'TICH$', 'TIK', 'TIK',
910
        'TIC$', 'TIZ', 'TIZ',
911
        'TIGGESTELL-------', 'TIK ', 'TIK ',
912
        'TIGSTELL-----', 'TIK ', 'TIK ',
913
        'TOAS-^', 'TO', 'TU',
914
        'TOILET-', 'TOLE', 'TULE',
915
        'TOIN-', 'TOA', 'TUA',
916
        'TRAECHTI-^', 'TRECHT', 'TREKT',
917
        'TRAECHTIG--', ' TRECHT', ' TREKT',
918
        'TRAINI-', 'TREN', 'TREN',
919
        'TRÄCHTI-^', 'TRECHT', 'TREKT',
920
        'TRÄCHTIG--', ' TRECHT', ' TREKT',
921
        'TSCH', 'SH', 'Z',
922
        'TSH', 'SH', 'Z',
923
        'TST', 'ZT', 'ZT',
924
        'T(Sß)', 'Z', 'Z',
925
        'TT(SZ)--<', '', '',
926
        'TT9', 'T', 'T',
927
        'TV^$', 'TV', 'TV',
928
        'TX(AEIOU)-3', 'SH', 'Z',
929
        'TY9^', 'TÜ', None,
930
        'TZ-', '', '',
931
        'T\'S3$', 'Z', 'Z',
932
        'T´S3$', 'Z', 'Z',
933
        'UEBEL(GNRW)-^^', 'ÜBL ', 'IBL ',
934
        'UEBER^^', 'ÜBA', 'IBA',
935
        'UE2', 'Ü', 'I',
936
        'UGL-', 'UK', None,
937
        'UH(AOÖUÜY)-', 'UH', None,
938
        'UIE$', 'Ü', 'I',
939
        'UM^^', 'UM', 'UN',
940
        'UNTERE--3', 'UNTE', 'UNTE',
941
        'UNTER^^', 'UNTA', 'UNTA',
942
        'UNVER^^', 'UNFA', 'UNFA',
943
        'UN^^', 'UN', 'UN',
944
        'UTI(AÄOÖUÜ)-', 'UZI', 'UZI',
945
        'UVE-4', 'UW', None,
946
        'UY2', 'UI', None,
947
        'UZZ', 'AS', 'AZ',
948
        'VACL-^', 'WAZ', 'FAZ',
949
        'VAC$', 'WAZ', 'FAZ',
950
        'VAN DEN ^', 'FANDN', 'FANTN',
951
        'VANES-^', 'WANE', None,
952
        'VATRO-', 'WATR', None,
953
        'VA(DHJNT)--^', 'F', None,
954
        'VEDD-^', 'FE', 'FE',
955
        'VE(BEHIU)--^', 'F', None,
956
        'VEL(BDLMNT)-^', 'FEL', None,
957
        'VENTZ-^', 'FEN', None,
958
        'VEN(NRSZ)-^', 'FEN', None,
959
        'VER(AB)-^$', 'WER', None,
960
        'VERBAL^$', 'WERBAL', None,
961
        'VERBAL(EINS)-^', 'WERBAL', None,
962
        'VERTEBR--', 'WERTE', None,
963
        'VEREIN-----', 'F', None,
964
        'VEREN(AEIOU)-^', 'WEREN', None,
965
        'VERIFI', 'WERIFI', None,
966
        'VERON(AEIOU)-^', 'WERON', None,
967
        'VERSEN^', 'FERSN', 'FAZN',
968
        'VERSIERT--^', 'WERSI', None,
969
        'VERSIO--^', 'WERS', None,
970
        'VERSUS', 'WERSUS', None,
971
        'VERTI(GK)-', 'WERTI', None,
972
        'VER^^', 'FER', 'FA',
973
        'VERSPRECHE-------', ' FER', ' FA',
974
        'VER$', 'WA', None,
975
        'VER', 'FA', 'FA',
976
        'VET(HT)-^', 'FET', 'FET',
977
        'VETTE$', 'WET', 'FET',
978
        'VE^', 'WE', None,
979
        'VIC$', 'WIZ', 'FIZ',
980
        'VIELSAGE----', 'FIL ', 'FIL ',
981
        'VIEL', 'FIL', 'FIL',
982
        'VIEW', 'WIU', 'FIU',
983
        'VILL(AE)-', 'WIL', None,
984
        'VIS(ACEIKUVWZ)-<^', 'WIS', None,
985
        'VI(ELS)--^', 'F', None,
986
        'VILLON--', 'WILI', 'FILI',
987
        'VIZE^^', 'FIZE', 'FIZE',
988
        'VLIE--^', 'FL', None,
989
        'VL(AEIOU)--', 'W', None,
990
        'VOKA-^', 'WOK', None,
991
        'VOL(ATUVW)--^', 'WO', None,
992
        'VOR^^', 'FOR', 'FUR',
993
        'VR(AEIOU)--', 'W', None,
994
        'VV9', 'W', None,
995
        'VY9^', 'WÜ', 'FI',
996
        'V(ÜY)-', 'W', None,
997
        'V(ÀÁÂÃÅÈÉÊÌÍÎÙÚÛ)-', 'W', None,
998
        'V(AEIJLRU)-<', 'W', None,
999
        'V.^', 'V.', None,
1000
        'V<', 'F', 'F',
1001
        'WEITERENTWI-----^', 'WEITA ', 'FEITA ',
1002
        'WEITREICH-----^', 'WEIT ', 'FEIT ',
1003
        'WEITVER^', 'WEIT FER', 'FEIT FA',
1004
        'WE(LMNRST)-3^', 'WE', 'FE',
1005
        'WER(DST)-', 'WER', None,
1006
        'WIC$', 'WIZ', 'FIZ',
1007
        'WIEDERU--', 'WIDE', 'FITE',
1008
        'WIEDER^$', 'WIDA', 'FITA',
1009
        'WIEDER^^', 'WIDA ', 'FITA ',
1010
        'WIEVIEL', 'WI FIL', 'FI FIL',
1011
        'WISUEL', 'WISUEL', None,
1012
        'WR-^', 'W', None,
1013
        'WY9^', 'WÜ', 'FI',
1014
        'W(BDFGJKLMNPQRSTZ)-', 'F', None,
1015
        'W$', 'F', None,
1016
        'W', None, 'F',
1017
        'X<^', 'Z', 'Z',
1018
        'XHAVEN$', 'XAFN', None,
1019
        'X(CSZ)', 'X', 'X',
1020
        'XTS(CH)--', 'XT', 'XT',
1021
        'XT(SZ)', 'Z', 'Z',
1022
        'YE(LMNRST)-3^', 'IE', 'IE',
1023
        'YE-3', 'I', 'I',
1024
        'YOR(GK)^$', 'IÖRK', 'IÖRK',
1025
        'Y(AOU)-<7', 'I', 'I',
1026
        'Y(BKLMNPRSTX)-1', 'Ü', None,
1027
        'YVES^$', 'IF', 'IF',
1028
        'YVONNE^$', 'IWON', 'IFUN',
1029
        'Y.^', 'Y.', None,
1030
        'Y', 'I', 'I',
1031
        'ZC(AOU)-', 'SK', 'ZK',
1032
        'ZE(LMNRST)-3^', 'ZE', 'ZE',
1033
        'ZIEJ$', 'ZI', 'ZI',
1034
        'ZIGERJA(HR)-3', 'ZIGA IA', 'ZIKA IA',
1035
        'ZL(AEIOU)-', 'SL', None,
1036
        'ZS(CHT)--', '', '',
1037
        'ZS', 'SH', 'Z',
1038
        'ZUERST', 'ZUERST', 'ZUERST',
1039
        'ZUGRUNDE^$', 'ZU GRUNDE', 'ZU KRUNTE',
1040
        'ZUGRUNDE', 'ZU GRUNDE ', 'ZU KRUNTE ',
1041
        'ZUGUNSTEN', 'ZU GUNSTN', 'ZU KUNZTN',
1042
        'ZUHAUSE-', 'ZU HAUS', 'ZU AUZ',
1043
        'ZULASTEN^$', 'ZU LASTN', 'ZU LAZTN',
1044
        'ZURUECK^^', 'ZURÜK', 'ZURIK',
1045
        'ZURZEIT', 'ZUR ZEIT', 'ZUR ZEIT',
1046
        'ZURÜCK^^', 'ZURÜK', 'ZURIK',
1047
        'ZUSTANDE', 'ZU STANDE', 'ZU ZTANTE',
1048
        'ZUTAGE', 'ZU TAGE', 'ZU TAKE',
1049
        'ZUVER^^', 'ZUFA', 'ZUFA',
1050
        'ZUVIEL', 'ZU FIL', 'ZU FIL',
1051
        'ZUWENIG', 'ZU WENIK', 'ZU FENIK',
1052
        'ZY9^', 'ZÜ', None,
1053
        'ZYK3$', 'ZIK', None,
1054
        'Z(VW)7^', 'SW', None,
1055
        None, None, None
1056
        # fmt: on
1057
    )
1058
1059 1
    # Translation table passed to ``str.translate`` to upper-case input terms
    # before rule matching. Keys are the code points of the lower-case
    # letters (ASCII plus accented Latin letters); values are the matching
    # upper-case characters. 'ß' deliberately maps to itself, since the rule
    # tables match on 'ß' directly.
    _upper_trans = {
        ord(lower): upper
        for lower, upper in zip(
            'abcdefghijklmnopqrstuvwxyzàáâãåäæ'
            'çðèéêëìíîïñòóôõöøœšßþùúûüýÿ',
            'ABCDEFGHIJKLMNOPQRSTUVWXYZÀÁÂÃÅÄÆ'
            'ÇÐÈÉÊËÌÍÎÏÑÒÓÔÕÖØŒŠßÞÙÚÛÜÝŸ',
        )
    }
1070
1071 1
    def encode(self, word, mode=1, lang='de'):
0 ignored issues
show
Bug introduced by
Parameters differ from overridden 'encode' method
Loading history...
1072
        """Return the phonet code for a word.
1073
1074
        :param str word: the word to transform
1075
        :param int mode: the phonet variant to employ (1 or 2)
1076
        :param str lang: 'de' (default) for German
1077
                'none' for no language
1078
        :returns: the phonet value
1079
        :rtype: str
1080
1081
        >>> pe = Phonet()
1082
        >>> pe.encode('Christopher')
1083
        'KRISTOFA'
1084
        >>> pe.encode('Niall')
1085
        'NIAL'
1086
        >>> pe.encode('Smith')
1087
        'SMIT'
1088
        >>> pe.encode('Schmidt')
1089
        'SHMIT'
1090
1091
        >>> pe.encode('Christopher', mode=2)
1092
        'KRIZTUFA'
1093
        >>> pe.encode('Niall', mode=2)
1094
        'NIAL'
1095
        >>> pe.encode('Smith', mode=2)
1096
        'ZNIT'
1097
        >>> pe.encode('Schmidt', mode=2)
1098
        'ZNIT'
1099
1100
        >>> pe.encode('Christopher', lang='none')
1101
        'CHRISTOPHER'
1102
        >>> pe.encode('Niall', lang='none')
1103
        'NIAL'
1104
        >>> pe.encode('Smith', lang='none')
1105
        'SMITH'
1106
        >>> pe.encode('Schmidt', lang='none')
1107
        'SCHMIDT'
1108
        """
1109 1
        phonet_hash = Counter()
1110 1
        alpha_pos = Counter()
1111
1112 1
        phonet_hash_1 = Counter()
1113 1
        phonet_hash_2 = Counter()
1114
1115 1
        def _initialize_phonet(lang):
1116
            """Initialize phonet variables."""
1117 1
            if lang == 'none':
1118 1
                _phonet_rules = self._rules_no_lang
1119
            else:
1120 1
                _phonet_rules = self._rules_german
1121
1122 1
            phonet_hash[''] = -1
1123
1124
            # German and international umlauts
1125 1
            for j in {
1126
                'À',
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
1127
                'Á',
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
1128
                'Â',
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
1129
                'Ã',
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
1130
                'Ä',
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
1131
                'Å',
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
1132
                'Æ',
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
1133
                'Ç',
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
1134
                'È',
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
1135
                'É',
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
1136
                'Ê',
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
1137
                'Ë',
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
1138
                'Ì',
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
1139
                'Í',
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
1140
                'Î',
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
1141
                'Ï',
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
1142
                'Ð',
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
1143
                'Ñ',
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
1144
                'Ò',
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
1145
                'Ó',
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
1146
                'Ô',
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
1147
                'Õ',
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
1148
                'Ö',
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
1149
                'Ø',
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
1150
                'Ù',
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
1151
                'Ú',
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
1152
                'Û',
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
1153
                'Ü',
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
1154
                'Ý',
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
1155
                'Þ',
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
1156
                'ß',
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
1157
                'Œ',
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
1158
                'Š',
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
1159
                'Ÿ',
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
1160
            }:
1161 1
                alpha_pos[j] = 1
1162 1
                phonet_hash[j] = -1
1163
1164
            # "normal" letters ('A'-'Z')
1165 1
            for i, j in enumerate('ABCDEFGHIJKLMNOPQRSTUVWXYZ'):
1166 1
                alpha_pos[j] = i + 2
1167 1
                phonet_hash[j] = -1
1168
1169 1
            for i in range(26):
1170 1
                for j in range(28):
1171 1
                    phonet_hash_1[i, j] = -1
1172 1
                    phonet_hash_2[i, j] = -1
1173
1174
            # for each phonetic rule
1175 1
            for i in range(len(_phonet_rules)):
0 ignored issues
show
unused-code introduced by
Consider using enumerate instead of iterating with range and len
Loading history...
unused-code introduced by
Too many nested blocks (6/5)
Loading history...
1176 1
                rule = _phonet_rules[i]
1177
1178 1
                if rule and i % 3 == 0:
1179
                    # calculate first hash value
1180 1
                    k = _phonet_rules[i][0]
1181
1182 1
                    if phonet_hash[k] < 0 and (
0 ignored issues
show
Comprehensibility Best Practice introduced by
The variable phonet_hash does not seem to be defined.
Loading history...
1183
                        _phonet_rules[i + 1] or _phonet_rules[i + 2]
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
1184
                    ):
1185 1
                        phonet_hash[k] = i
1186
1187
                    # calculate second hash values
1188 1
                    if k and alpha_pos[k] >= 2:
0 ignored issues
show
Comprehensibility Best Practice introduced by
The variable alpha_pos does not seem to be defined.
Loading history...
1189 1
                        k = alpha_pos[k]
1190
1191 1
                        j = k - 2
1192 1
                        rule = rule[1:]
1193
1194 1
                        if not rule:
1195 1
                            rule = ' '
1196 1
                        elif rule[0] == '(':
1197 1
                            rule = rule[1:]
1198
                        else:
1199 1
                            rule = rule[0]
1200
1201 1
                        while rule and (rule[0] != ')'):
1202 1
                            k = alpha_pos[rule[0]]
1203
1204 1
                            if k > 0:
1205
                                # add hash value for this letter
1206 1
                                if phonet_hash_1[j, k] < 0:
0 ignored issues
show
Comprehensibility Best Practice introduced by
The variable phonet_hash_1 does not seem to be defined.
Loading history...
1207 1
                                    phonet_hash_1[j, k] = i
1208 1
                                    phonet_hash_2[j, k] = i
1209
1210 1
                                if phonet_hash_2[j, k] >= (i - 30):
0 ignored issues
show
Comprehensibility Best Practice introduced by
The variable phonet_hash_2 does not seem to be defined.
Loading history...
1211 1
                                    phonet_hash_2[j, k] = i
1212
                                else:
1213 1
                                    k = -1
1214
1215 1
                            if k <= 0:
1216
                                # add hash value for all letters
1217 1
                                if phonet_hash_1[j, 0] < 0:
1218 1
                                    phonet_hash_1[j, 0] = i
1219
1220 1
                                phonet_hash_2[j, 0] = i
1221
1222 1
                            rule = rule[1:]
1223
1224 1
        def _phonet(term, mode, lang):
0 ignored issues
show
Comprehensibility introduced by
This function exceeds the maximum number of variables (29/15).
Loading history...
1225
            """Return the phonet coded form of a term."""
1226 1
            if lang == 'none':
1227 1
                _phonet_rules = self._rules_no_lang
1228
            else:
1229 1
                _phonet_rules = self._rules_german
1230
1231 1
            char0 = ''
1232 1
            dest = term
1233
1234 1
            if not term:
1235 1
                return ''
1236
1237 1
            term_length = len(term)
1238
1239
            # convert input string to upper-case
1240 1
            src = term.translate(self._upper_trans)
1241
1242
            # check "src"
1243 1
            i = 0
1244 1
            j = 0
1245 1
            zeta = 0
1246
1247 1
            while i < len(src):
0 ignored issues
show
unused-code introduced by
Too many nested blocks (6/5)
Loading history...
unused-code introduced by
Too many nested blocks (8/5)
Loading history...
unused-code introduced by
Too many nested blocks (9/5)
Loading history...
unused-code introduced by
Too many nested blocks (7/5)
Loading history...
1248 1
                char = src[i]
1249
1250 1
                pos = alpha_pos[char]
1251
1252 1
                if pos >= 2:
1253 1
                    xpos = pos - 2
1254
1255 1
                    if i + 1 == len(src):
1256 1
                        pos = alpha_pos['']
1257
                    else:
1258 1
                        pos = alpha_pos[src[i + 1]]
1259
1260 1
                    start1 = phonet_hash_1[xpos, pos]
1261 1
                    start2 = phonet_hash_1[xpos, 0]
1262 1
                    end1 = phonet_hash_2[xpos, pos]
1263 1
                    end2 = phonet_hash_2[xpos, 0]
1264
1265
                    # preserve rule priorities
1266 1
                    if (start2 >= 0) and ((start1 < 0) or (start2 < start1)):
1267 1
                        pos = start1
1268 1
                        start1 = start2
1269 1
                        start2 = pos
1270 1
                        pos = end1
1271 1
                        end1 = end2
1272 1
                        end2 = pos
1273
1274 1
                    if (end1 >= start2) and (start2 >= 0):
1275 1
                        if end2 > end1:
1276 1
                            end1 = end2
1277
1278 1
                        start2 = -1
1279 1
                        end2 = -1
1280
                else:
1281 1
                    pos = phonet_hash[char]
1282 1
                    start1 = pos
1283 1
                    end1 = 10000
1284 1
                    start2 = -1
1285 1
                    end2 = -1
1286
1287 1
                pos = start1
1288 1
                zeta0 = 0
1289
1290 1
                if pos >= 0:
1291
                    # check rules for this char
1292 1
                    while (_phonet_rules[pos] is None) or (
1293
                        _phonet_rules[pos][0] == char
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
1294
                    ):
1295 1
                        if pos > end1:
1296 1
                            if start2 > 0:
1297 1
                                pos = start2
1298 1
                                start1 = start2
1299 1
                                start2 = -1
1300 1
                                end1 = end2
1301 1
                                end2 = -1
1302 1
                                continue
1303
1304 1
                            break
1305
1306 1
                        if (_phonet_rules[pos] is None) or (
1307
                            _phonet_rules[pos + mode] is None
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
1308
                        ):
1309
                            # no conversion rule available
1310 1
                            pos += 3
1311 1
                            continue
1312
1313
                        # check whole string
1314 1
                        matches = 1  # number of matching letters
1315 1
                        priority = 5  # default priority
1316 1
                        rule = _phonet_rules[pos]
1317 1
                        rule = rule[1:]
1318
1319 1
                        while (
1320
                            rule
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
1321
                            and (len(src) > (i + matches))
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
1322
                            and (src[i + matches] == rule[0])
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
1323
                            and not rule[0].isdigit()
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
1324
                            and (rule not in '(-<^$')
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
1325
                        ):
1326 1
                            matches += 1
1327 1
                            rule = rule[1:]
1328
1329 1
                        if rule and (rule[0] == '('):
1330
                            # check an array of letters
1331 1
                            if (
1332
                                (len(src) > (i + matches))
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
1333
                                and src[i + matches].isalpha()
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
1334
                                and (src[i + matches] in rule[1:])
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
1335
                            ):
1336 1
                                matches += 1
1337
1338 1
                                while rule and rule[0] != ')':
1339 1
                                    rule = rule[1:]
1340
1341
                                # if rule[0] == ')':
1342 1
                                rule = rule[1:]
1343
1344 1
                        if rule:
1345 1
                            priority0 = ord(rule[0])
1346
                        else:
1347 1
                            priority0 = 0
1348
1349 1
                        matches0 = matches
1350
1351 1
                        while rule and rule[0] == '-' and matches > 1:
1352 1
                            matches -= 1
1353 1
                            rule = rule[1:]
1354
1355 1
                        if rule and rule[0] == '<':
1356 1
                            rule = rule[1:]
1357
1358 1
                        if rule and rule[0].isdigit():
1359
                            # read priority
1360 1
                            priority = int(rule[0])
1361 1
                            rule = rule[1:]
1362
1363 1
                        if rule and rule[0:2] == '^^':
1364 1
                            rule = rule[1:]
1365
1366 1
                        if (
1367
                            not rule
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
best-practice introduced by
Too many boolean expressions in if statement (12/5)
Loading history...
1368
                            or (
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
1369
                                (rule[0] == '^')
1370
                                and ((i == 0) or not src[i - 1].isalpha())
1371
                                and (
1372
                                    (rule[1:2] != '$')
1373
                                    or (
1374
                                        not (
1375
                                            src[
1376
                                                i + matches0 : i + matches0 + 1
1377
                                            ].isalpha()
1378
                                        )
1379
                                        and (
1380
                                            src[
1381
                                                i + matches0 : i + matches0 + 1
1382
                                            ]
1383
                                            != '.'
1384
                                        )
1385
                                    )
1386
                                )
1387
                            )
1388
                            or (
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
1389
                                (rule[0] == '$')
1390
                                and (i > 0)
1391
                                and src[i - 1].isalpha()
1392
                                and (
1393
                                    (
1394
                                        not src[
1395
                                            i + matches0 : i + matches0 + 1
1396
                                        ].isalpha()
1397
                                    )
1398
                                    and (
1399
                                        src[i + matches0 : i + matches0 + 1]
1400
                                        != '.'
1401
                                    )
1402
                                )
1403
                            )
1404
                        ):
1405
                            # look for continuation, if:
1406
                            # matches > 1 and NO '-' in first string
1407 1
                            pos0 = -1
1408
1409 1
                            start3 = 0
1410 1
                            start4 = 0
1411 1
                            end3 = 0
1412 1
                            end4 = 0
1413
1414 1
                            if (
1415
                                (matches > 1)
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
1416
                                and src[i + matches : i + matches + 1]
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
1417
                                and (priority0 != ord('-'))
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
1418
                            ):
1419 1
                                char0 = src[i + matches - 1]
1420 1
                                pos0 = alpha_pos[char0]
1421
1422 1
                                if pos0 >= 2 and src[i + matches]:
1423 1
                                    xpos = pos0 - 2
1424 1
                                    pos0 = alpha_pos[src[i + matches]]
1425 1
                                    start3 = phonet_hash_1[xpos, pos0]
1426 1
                                    start4 = phonet_hash_1[xpos, 0]
1427 1
                                    end3 = phonet_hash_2[xpos, pos0]
1428 1
                                    end4 = phonet_hash_2[xpos, 0]
1429
1430
                                    # preserve rule priorities
1431 1
                                    if (start4 >= 0) and (
1432
                                        (start3 < 0) or (start4 < start3)
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
1433
                                    ):
1434 1
                                        pos0 = start3
1435 1
                                        start3 = start4
1436 1
                                        start4 = pos0
1437 1
                                        pos0 = end3
1438 1
                                        end3 = end4
1439 1
                                        end4 = pos0
1440
1441 1
                                    if (end3 >= start4) and (start4 >= 0):
1442 1
                                        if end4 > end3:
1443 1
                                            end3 = end4
1444
1445 1
                                        start4 = -1
1446 1
                                        end4 = -1
1447
                                else:
1448 1
                                    pos0 = phonet_hash[char0]
1449 1
                                    start3 = pos0
1450 1
                                    end3 = 10000
1451 1
                                    start4 = -1
1452 1
                                    end4 = -1
1453
1454 1
                                pos0 = start3
1455
1456
                            # check continuation rules for src[i+matches]
1457 1
                            if pos0 >= 0:
1458 1
                                while (_phonet_rules[pos0] is None) or (
1459
                                    _phonet_rules[pos0][0] == char0
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
introduced by
Value '_phonet_rules[pos0]' is unsubscriptable
Loading history...
1460
                                ):
1461 1
                                    if pos0 > end3:
1462 1
                                        if start4 > 0:
1463 1
                                            pos0 = start4
1464 1
                                            start3 = start4
1465 1
                                            start4 = -1
1466 1
                                            end3 = end4
1467 1
                                            end4 = -1
1468 1
                                            continue
1469
1470 1
                                        priority0 = -1
1471
1472
                                        # important
1473 1
                                        break
1474
1475 1
                                    if (_phonet_rules[pos0] is None) or (
1476
                                        _phonet_rules[pos0 + mode] is None
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
1477
                                    ):
1478
                                        # no conversion rule available
1479 1
                                        pos0 += 3
1480 1
                                        continue
1481
1482
                                    # check whole string
1483 1
                                    matches0 = matches
1484 1
                                    priority0 = 5
1485 1
                                    rule = _phonet_rules[pos0]
1486 1
                                    rule = rule[1:]
0 ignored issues
show
introduced by
Value 'rule' is unsubscriptable
Loading history...
1487
1488 1
                                    while (
1489
                                        rule
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
1490
                                        and (
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
1491
                                            src[
1492
                                                i + matches0 : i + matches0 + 1
1493
                                            ]
1494
                                            == rule[0]
1495
                                        )
1496
                                        and (
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
1497
                                            not rule[0].isdigit()
1498
                                            or (rule in '(-<^$')
1499
                                        )
1500
                                    ):
1501 1
                                        matches0 += 1
1502 1
                                        rule = rule[1:]
1503
1504 1
                                    if rule and rule[0] == '(':
1505
                                        # check an array of letters
1506 1
                                        if src[
1507
                                            i + matches0 : i + matches0 + 1
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
1508
                                        ].isalpha() and (
1509
                                            src[i + matches0] in rule[1:]
1510
                                        ):
1511 1
                                            matches0 += 1
1512
1513 1
                                            while rule and rule[0] != ')':
1514 1
                                                rule = rule[1:]
1515
1516
                                            # if rule[0] == ')':
1517 1
                                            rule = rule[1:]
1518
1519 1
                                    while rule and rule[0] == '-':
1520
                                        # "matches0" is NOT decremented
1521
                                        # because of
1522
                                        #    "if (matches0 == matches)"
1523 1
                                        rule = rule[1:]
1524
1525 1
                                    if rule and rule[0] == '<':
1526 1
                                        rule = rule[1:]
1527
1528 1
                                    if rule and rule[0].isdigit():
1529 1
                                        priority0 = int(rule[0])
1530 1
                                        rule = rule[1:]
1531
1532 1
                                    if (
1533
                                        not rule
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
1534
                                        or
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
1535
                                        # rule == '^' is not possible here
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
1536
                                        (
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
1537
                                            (rule[0] == '$')
1538
                                            and not src[
1539
                                                i + matches0 : i + matches0 + 1
1540
                                            ].isalpha()
1541
                                            and (
1542
                                                src[
1543
                                                    i
1544
                                                    + matches0 : i
1545
                                                    + matches0
1546
                                                    + 1
1547
                                                ]
1548
                                                != '.'
1549
                                            )
1550
                                        )
1551
                                    ):
1552 1
                                        if matches0 == matches:
1553
                                            # this is only a partial string
1554 1
                                            pos0 += 3
1555 1
                                            continue
1556
1557 1
                                        if priority0 < priority:
1558
                                            # priority is too low
1559 1
                                            pos0 += 3
1560 1
                                            continue
1561
1562
                                        # continuation rule found
1563 1
                                        break
1564
1565 1
                                    pos0 += 3
1566
1567
                                # end of "while"
1568 1
                                if (priority0 >= priority) and (
1569
                                    (_phonet_rules[pos0] is not None)
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
1570
                                    and (_phonet_rules[pos0][0] == char0)
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
introduced by
Value '_phonet_rules[pos0]' is unsubscriptable
Loading history...
1571
                                ):
1572
1573 1
                                    pos += 3
1574 1
                                    continue
1575
1576
                            # replace string
1577 1
                            if _phonet_rules[pos] and (
1578
                                '<' in _phonet_rules[pos][1:]
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
1579
                            ):
1580 1
                                priority0 = 1
1581
                            else:
1582 1
                                priority0 = 0
1583
1584 1
                            rule = _phonet_rules[pos + mode]
1585
1586 1
                            if (priority0 == 1) and (zeta == 0):
1587
                                # rule with '<' is applied
1588 1
                                if (
1589
                                    (j > 0)
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
1590
                                    and rule
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
1591
                                    and (
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
1592
                                        (dest[j - 1] == char)
1593
                                        or (dest[j - 1] == rule[0])
1594
                                    )
1595
                                ):
1596 1
                                    j -= 1
1597
1598 1
                                zeta0 = 1
1599 1
                                zeta += 1
1600 1
                                matches0 = 0
1601
1602 1
                                while rule and src[i + matches0]:
1603 1
                                    src = (
1604
                                        src[0 : i + matches0]
1605
                                        + rule[0]
1606
                                        + src[i + matches0 + 1 :]
1607
                                    )
1608 1
                                    matches0 += 1
1609 1
                                    rule = rule[1:]
1610
1611 1
                                if matches0 < matches:
1612 1
                                    src = (
1613
                                        src[0 : i + matches0]
1614
                                        + src[i + matches :]
1615
                                    )
1616
1617 1
                                char = src[i]
1618
                            else:
1619 1
                                i = i + matches - 1
1620 1
                                zeta = 0
1621
1622 1
                                while len(rule) > 1:
1623 1
                                    if (j == 0) or (dest[j - 1] != rule[0]):
1624 1
                                        dest = (
1625
                                            dest[0:j]
1626
                                            + rule[0]
1627
                                            + dest[min(len(dest), j + 1) :]
1628
                                        )
1629 1
                                        j += 1
1630
1631 1
                                    rule = rule[1:]
1632
1633
                                # new "current char"
1634 1
                                if not rule:
1635 1
                                    rule = ''
1636 1
                                    char = ''
1637
                                else:
1638 1
                                    char = rule[0]
1639
1640 1
                                if (
1641
                                    _phonet_rules[pos]
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
1642
                                    and '^^' in _phonet_rules[pos][1:]
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
1643
                                ):
1644 1
                                    if char:
1645 1
                                        dest = (
1646
                                            dest[0:j]
1647
                                            + char
1648
                                            + dest[min(len(dest), j + 1) :]
1649
                                        )
1650 1
                                        j += 1
1651
1652 1
                                    src = src[i + 1 :]
1653 1
                                    i = 0
1654 1
                                    zeta0 = 1
1655
1656 1
                            break
1657
1658 1
                        pos += 3
1659
1660 1
                        if pos > end1 and start2 > 0:
1661 1
                            pos = start2
1662 1
                            start1 = start2
1663 1
                            end1 = end2
1664 1
                            start2 = -1
1665 1
                            end2 = -1
1666
1667 1
                if zeta0 == 0:
1668 1
                    if char and ((j == 0) or (dest[j - 1] != char)):
1669
                        # delete multiple letters only
1670 1
                        dest = (
1671
                            dest[0:j] + char + dest[min(j + 1, term_length) :]
1672
                        )
1673 1
                        j += 1
1674
1675 1
                    i += 1
1676 1
                    zeta = 0
1677
1678 1
            dest = dest[0:j]
1679
1680 1
            return dest
1681
1682 1
        _initialize_phonet(lang)
1683
1684 1
        word = unicode_normalize('NFKC', text_type(word))
1685 1
        return _phonet(word, mode, lang)
1686
1687
1688 1
def phonet(word, mode=1, lang='de'):
    """Encode a word with the phonet algorithm.

    Convenience function: builds a :py:class:`Phonet` instance and
    delegates to :py:meth:`Phonet.encode`, forwarding all three arguments
    unchanged.

    :param str word: the word to transform
    :param int mode: the phonet variant to employ (1 or 2)
    :param str lang: 'de' (default) for German
            'none' for no language
    :returns: the phonet value
    :rtype: str

    >>> phonet('Christopher')
    'KRISTOFA'
    >>> phonet('Niall')
    'NIAL'
    >>> phonet('Smith')
    'SMIT'
    >>> phonet('Schmidt')
    'SHMIT'

    >>> phonet('Christopher', mode=2)
    'KRIZTUFA'
    >>> phonet('Niall', mode=2)
    'NIAL'
    >>> phonet('Smith', mode=2)
    'ZNIT'
    >>> phonet('Schmidt', mode=2)
    'ZNIT'

    >>> phonet('Christopher', lang='none')
    'CHRISTOPHER'
    >>> phonet('Niall', lang='none')
    'NIAL'
    >>> phonet('Smith', lang='none')
    'SMITH'
    >>> phonet('Schmidt', lang='none')
    'SCHMIDT'
    """
    # A fresh encoder is created per call; the wrapper keeps no state.
    encoder = Phonet()
    return encoder.encode(word, mode, lang)
1728
1729
1730
if __name__ == '__main__':
    # Only when run as a script (not on import): execute the examples
    # embedded in this module's docstrings as a self-test.
    import doctest

    doctest.testmod()
1734