Completed
Pull Request — master (#141)
by Chris
11:04
created

abydos.phonetic._phonet.Phonet.encode()   F

Complexity

Conditions 142

Size

Total Lines 633
Code Lines 381

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 269
CRAP Score 142

Importance

Changes 0
Metric Value
eloc 381
dl 0
loc 633
ccs 269
cts 269
cp 1
rs 0
c 0
b 0
f 0
cc 142
nop 4
crap 142

How to fix   Long Method    Complexity   

Long Method

Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.

For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.

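Commonly applied refactorings include Extract Method: moving a cohesive (often commented) part of the body into its own well-named method.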

Complexity

Complex classes, such as the one containing abydos.phonetic._phonet.Phonet.encode(), often do a lot of different things. To break such a class down, we need to identify a cohesive component within that class. A common approach to find such a component is to look for fields/methods that share the same prefixes or suffixes.

Once you have determined the fields that belong together, you can apply the Extract Class refactoring. If the component makes sense as a sub-class, Extract Subclass is also a candidate, and is often faster.

1
# -*- coding: utf-8 -*-
0 ignored issues
show
coding-style introduced by
Too many lines in module (1758/1000)
Loading history...
2
3
# Copyright 2014-2018 by Christopher C. Little.
4
# This file is part of Abydos.
5
#
6
# Abydos is free software: you can redistribute it and/or modify
7
# it under the terms of the GNU General Public License as published by
8
# the Free Software Foundation, either version 3 of the License, or
9
# (at your option) any later version.
10
#
11
# Abydos is distributed in the hope that it will be useful,
12
# but WITHOUT ANY WARRANTY; without even the implied warranty of
13
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14
# GNU General Public License for more details.
15
#
16
# You should have received a copy of the GNU General Public License
17
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.
18
19 1
"""abydos.phonetic._phonet.
20
21
phonet algorithm (a.k.a. Hannoveraner Phonetik), intended chiefly for German
22
"""
23
24 1
from __future__ import (
25
    absolute_import,
26
    division,
27
    print_function,
28
    unicode_literals,
29
)
30
31 1
from collections import Counter
32 1
from unicodedata import normalize as unicode_normalize
33
34 1
from six import text_type
35 1
from six.moves import range
36
37 1
from ._phonetic import Phonetic
38
39 1
__all__ = ['Phonet', 'phonet']
40
41
42 1
class Phonet(Phonetic):
0 ignored issues
show
Unused Code introduced by
The variable __class__ seems to be unused.
Loading history...
43
    """Phonet code.
44
45
    phonet ("Hannoveraner Phonetik") was developed by Jörg Michael and
46
    documented in :cite:`Michael:1999`.
47
48
    This is a port of Jesper Zedlitz's code, which is licensed LGPL
49
    :cite:`Zedlitz:2015`.
50
51
    That is, in turn, based on Michael's C code, which is also licensed LGPL
52
    :cite:`Michael:2007`.
53
    """
54
55 1
    _rules_no_lang = (  # separator chars
56
        # fmt: off
57
        '´', ' ', ' ',
58
        '"', ' ', ' ',
59
        '`$', '', '',
60
        '\'', ' ', ' ',
61
        ',', ',', ',',
62
        ';', ',', ',',
63
        '-', ' ', ' ',
64
        ' ', ' ', ' ',
65
        '.', '.', '.',
66
        ':', '.', '.',
67
        # German umlauts
68
        'Ä', 'AE', 'AE',
69
        'Ö', 'OE', 'OE',
70
        'Ü', 'UE', 'UE',
71
        'ß', 'S', 'S',
72
        # international umlauts
73
        'À', 'A', 'A',
74
        'Á', 'A', 'A',
75
        'Â', 'A', 'A',
76
        'Ã', 'A', 'A',
77
        'Å', 'A', 'A',
78
        'Æ', 'AE', 'AE',
79
        'Ç', 'C', 'C',
80
        'Ð', 'DJ', 'DJ',
81
        'È', 'E', 'E',
82
        'É', 'E', 'E',
83
        'Ê', 'E', 'E',
84
        'Ë', 'E', 'E',
85
        'Ì', 'I', 'I',
86
        'Í', 'I', 'I',
87
        'Î', 'I', 'I',
88
        'Ï', 'I', 'I',
89
        'Ñ', 'NH', 'NH',
90
        'Ò', 'O', 'O',
91
        'Ó', 'O', 'O',
92
        'Ô', 'O', 'O',
93
        'Õ', 'O', 'O',
94
        'Œ', 'OE', 'OE',
95
        'Ø', 'OE', 'OE',
96
        'Š', 'SH', 'SH',
97
        'Þ', 'TH', 'TH',
98
        'Ù', 'U', 'U',
99
        'Ú', 'U', 'U',
100
        'Û', 'U', 'U',
101
        'Ý', 'Y', 'Y',
102
        'Ÿ', 'Y', 'Y',
103
        # 'normal' letters (A-Z)
104
        'MC^', 'MAC', 'MAC',
105
        'MC^', 'MAC', 'MAC',
106
        'M´^', 'MAC', 'MAC',
107
        'M\'^', 'MAC', 'MAC',
108
        'O´^', 'O', 'O',
109
        'O\'^', 'O', 'O',
110
        'VAN DEN ^', 'VANDEN', 'VANDEN',
111
        None, None, None
112
        # fmt: on
113
    )
114
115 1
    _rules_german = (  # separator chars
116
        # fmt: off
117
        '´', ' ', ' ',
118
        '"', ' ', ' ',
119
        '`$', '', '',
120
        '\'', ' ', ' ',
121
        ',', ' ', ' ',
122
        ';', ' ', ' ',
123
        '-', ' ', ' ',
124
        ' ', ' ', ' ',
125
        '.', '.', '.',
126
        ':', '.', '.',
127
        # German umlauts
128
        'ÄE', 'E', 'E',
129
        'ÄU<', 'EU', 'EU',
130
        'ÄV(AEOU)-<', 'EW', None,
131
        'Ä$', 'Ä', None,
132
        'Ä<', None, 'E',
133
        'Ä', 'E', None,
134
        'ÖE', 'Ö', 'Ö',
135
        'ÖU', 'Ö', 'Ö',
136
        'ÖVER--<', 'ÖW', None,
137
        'ÖV(AOU)-', 'ÖW', None,
138
        'ÜBEL(GNRW)-^^', 'ÜBL ', 'IBL ',
139
        'ÜBER^^', 'ÜBA', 'IBA',
140
        'ÜE', 'Ü', 'I',
141
        'ÜVER--<', 'ÜW', None,
142
        'ÜV(AOU)-', 'ÜW', None,
143
        'Ü', None, 'I',
144
        'ßCH<', None, 'Z',
145
        'ß<', 'S', 'Z',
146
        # international umlauts
147
        'À<', 'A', 'A',
148
        'Á<', 'A', 'A',
149
        'Â<', 'A', 'A',
150
        'Ã<', 'A', 'A',
151
        'Å<', 'A', 'A',
152
        'ÆER-', 'E', 'E',
153
        'ÆU<', 'EU', 'EU',
154
        'ÆV(AEOU)-<', 'EW', None,
155
        'Æ$', 'Ä', None,
156
        'Æ<', None, 'E',
157
        'Æ', 'E', None,
158
        'Ç', 'Z', 'Z',
159
        'ÐÐ-', '', '',
160
        'Ð', 'DI', 'TI',
161
        'È<', 'E', 'E',
162
        'É<', 'E', 'E',
163
        'Ê<', 'E', 'E',
164
        'Ë', 'E', 'E',
165
        'Ì<', 'I', 'I',
166
        'Í<', 'I', 'I',
167
        'Î<', 'I', 'I',
168
        'Ï', 'I', 'I',
169
        'ÑÑ-', '', '',
170
        'Ñ', 'NI', 'NI',
171
        'Ò<', 'O', 'U',
172
        'Ó<', 'O', 'U',
173
        'Ô<', 'O', 'U',
174
        'Õ<', 'O', 'U',
175
        'Œ<', 'Ö', 'Ö',
176
        'Ø(IJY)-<', 'E', 'E',
177
        'Ø<', 'Ö', 'Ö',
178
        'Š', 'SH', 'Z',
179
        'Þ', 'T', 'T',
180
        'Ù<', 'U', 'U',
181
        'Ú<', 'U', 'U',
182
        'Û<', 'U', 'U',
183
        'Ý<', 'I', 'I',
184
        'Ÿ<', 'I', 'I',
185
        # 'normal' letters (A-Z)
186
        'ABELLE$', 'ABL', 'ABL',
187
        'ABELL$', 'ABL', 'ABL',
188
        'ABIENNE$', 'ABIN', 'ABIN',
189
        'ACHME---^', 'ACH', 'AK',
190
        'ACEY$', 'AZI', 'AZI',
191
        'ADV', 'ATW', None,
192
        'AEGL-', 'EK', None,
193
        'AEU<', 'EU', 'EU',
194
        'AE2', 'E', 'E',
195
        'AFTRAUBEN------', 'AFT ', 'AFT ',
196
        'AGL-1', 'AK', None,
197
        'AGNI-^', 'AKN', 'AKN',
198
        'AGNIE-', 'ANI', 'ANI',
199
        'AGN(AEOU)-$', 'ANI', 'ANI',
200
        'AH(AIOÖUÜY)-', 'AH', None,
201
        'AIA2', 'AIA', 'AIA',
202
        'AIE$', 'E', 'E',
203
        'AILL(EOU)-', 'ALI', 'ALI',
204
        'AINE$', 'EN', 'EN',
205
        'AIRE$', 'ER', 'ER',
206
        'AIR-', 'E', 'E',
207
        'AISE$', 'ES', 'EZ',
208
        'AISSANCE$', 'ESANS', 'EZANZ',
209
        'AISSE$', 'ES', 'EZ',
210
        'AIX$', 'EX', 'EX',
211
        'AJ(AÄEÈÉÊIOÖUÜ)--', 'A', 'A',
212
        'AKTIE', 'AXIE', 'AXIE',
213
        'AKTUEL', 'AKTUEL', None,
214
        'ALOI^', 'ALOI', 'ALUI',  # Don't merge these rules
215
        'ALOY^', 'ALOI', 'ALUI',  # needed by 'check_rules'
216
        'AMATEU(RS)-', 'AMATÖ', 'ANATÖ',
217
        'ANCH(OEI)-', 'ANSH', 'ANZ',
218
        'ANDERGEGANG----', 'ANDA GE', 'ANTA KE',
219
        'ANDERGEHE----', 'ANDA ', 'ANTA ',
220
        'ANDERGESETZ----', 'ANDA GE', 'ANTA KE',
221
        'ANDERGING----', 'ANDA ', 'ANTA ',
222
        'ANDERSETZ(ET)-----', 'ANDA ', 'ANTA ',
223
        'ANDERZUGEHE----', 'ANDA ZU ', 'ANTA ZU ',
224
        'ANDERZUSETZE-----', 'ANDA ZU ', 'ANTA ZU ',
225
        'ANER(BKO)---^^', 'AN', None,
226
        'ANHAND---^$', 'AN H', 'AN ',
227
        'ANH(AÄEIOÖUÜY)--^^', 'AN', None,
228
        'ANIELLE$', 'ANIEL', 'ANIL',
229
        'ANIEL', 'ANIEL', None,
230
        'ANSTELLE----^$', 'AN ST', 'AN ZT',
231
        'ANTI^^', 'ANTI', 'ANTI',
232
        'ANVER^^', 'ANFA', 'ANFA',
233
        'ATIA$', 'ATIA', 'ATIA',
234
        'ATIA(NS)--', 'ATI', 'ATI',
235
        'ATI(AÄOÖUÜ)-', 'AZI', 'AZI',
236
        'AUAU--', '', '',
237
        'AUERE$', 'AUERE', None,
238
        'AUERE(NS)-$', 'AUERE', None,
239
        'AUERE(AIOUY)--', 'AUER', None,
240
        'AUER(AÄIOÖUÜY)-', 'AUER', None,
241
        'AUER<', 'AUA', 'AUA',
242
        'AUF^^', 'AUF', 'AUF',
243
        'AULT$', 'O', 'U',
244
        'AUR(BCDFGKLMNQSTVWZ)-', 'AUA', 'AUA',
245
        'AUR$', 'AUA', 'AUA',
246
        'AUSSE$', 'OS', 'UZ',
247
        'AUS(ST)-^', 'AUS', 'AUS',
248
        'AUS^^', 'AUS', 'AUS',
249
        'AUTOFAHR----', 'AUTO ', 'AUTU ',
250
        'AUTO^^', 'AUTO', 'AUTU',
251
        'AUX(IY)-', 'AUX', 'AUX',
252
        'AUX', 'O', 'U',
253
        'AU', 'AU', 'AU',
254
        'AVER--<', 'AW', None,
255
        'AVIER$', 'AWIE', 'AFIE',
256
        'AV(EÈÉÊI)-^', 'AW', None,
257
        'AV(AOU)-', 'AW', None,
258
        'AYRE$', 'EIRE', 'EIRE',
259
        'AYRE(NS)-$', 'EIRE', 'EIRE',
260
        'AYRE(AIOUY)--', 'EIR', 'EIR',
261
        'AYR(AÄIOÖUÜY)-', 'EIR', 'EIR',
262
        'AYR<', 'EIA', 'EIA',
263
        'AYER--<', 'EI', 'EI',
264
        'AY(AÄEIOÖUÜY)--', 'A', 'A',
265
        'AË', 'E', 'E',
266
        'A(IJY)<', 'EI', 'EI',
267
        'BABY^$', 'BEBI', 'BEBI',
268
        'BAB(IY)^', 'BEBI', 'BEBI',
269
        'BEAU^$', 'BO', None,
270
        'BEA(BCMNRU)-^', 'BEA', 'BEA',
271
        'BEAT(AEIMORU)-^', 'BEAT', 'BEAT',
272
        'BEE$', 'BI', 'BI',
273
        'BEIGE^$', 'BESH', 'BEZ',
274
        'BENOIT--', 'BENO', 'BENU',
275
        'BER(DT)-', 'BER', None,
276
        'BERN(DT)-', 'BERN', None,
277
        'BE(LMNRST)-^', 'BE', 'BE',
278
        'BETTE$', 'BET', 'BET',
279
        'BEVOR^$', 'BEFOR', None,
280
        'BIC$', 'BIZ', 'BIZ',
281
        'BOWL(EI)-', 'BOL', 'BUL',
282
        'BP(AÄEÈÉÊIÌÍÎOÖRUÜY)-', 'B', 'B',
283
        'BRINGEND-----^', 'BRI', 'BRI',
284
        'BRINGEND-----', ' BRI', ' BRI',
285
        'BROW(NS)-', 'BRAU', 'BRAU',
286
        'BUDGET7', 'BÜGE', 'BIKE',
287
        'BUFFET7', 'BÜFE', 'BIFE',
288
        'BYLLE$', 'BILE', 'BILE',
289
        'BYLL$', 'BIL', 'BIL',
290
        'BYPA--^', 'BEI', 'BEI',
291
        'BYTE<', 'BEIT', 'BEIT',
292
        'BY9^', 'BÜ', None,
293
        'B(SßZ)$', 'BS', None,
294
        'CACH(EI)-^', 'KESH', 'KEZ',
295
        'CAE--', 'Z', 'Z',
296
        'CA(IY)$', 'ZEI', 'ZEI',
297
        'CE(EIJUY)--', 'Z', 'Z',
298
        'CENT<', 'ZENT', 'ZENT',
299
        'CERST(EI)----^', 'KE', 'KE',
300
        'CER$', 'ZA', 'ZA',
301
        'CE3', 'ZE', 'ZE',
302
        'CH\'S$', 'X', 'X',
303
        'CH´S$', 'X', 'X',
304
        'CHAO(ST)-', 'KAO', 'KAU',
305
        'CHAMPIO-^', 'SHEMPI', 'ZENBI',
306
        'CHAR(AI)-^', 'KAR', 'KAR',
307
        'CHAU(CDFSVWXZ)-', 'SHO', 'ZU',
308
        'CHÄ(CF)-', 'SHE', 'ZE',
309
        'CHE(CF)-', 'SHE', 'ZE',
310
        'CHEM-^', 'KE', 'KE',  # or: 'CHE', 'KE'
311
        'CHEQUE<', 'SHEK', 'ZEK',
312
        'CHI(CFGPVW)-', 'SHI', 'ZI',
313
        'CH(AEUY)-<^', 'SH', 'Z',
314
        'CHK-', '', '',
315
        'CHO(CKPS)-^', 'SHO', 'ZU',
316
        'CHRIS-', 'KRI', None,
317
        'CHRO-', 'KR', None,
318
        'CH(LOR)-<^', 'K', 'K',
319
        'CHST-', 'X', 'X',
320
        'CH(SßXZ)3', 'X', 'X',
321
        'CHTNI-3', 'CHN', 'KN',
322
        'CH^', 'K', 'K',  # or: 'CH', 'K'
323
        'CH', 'CH', 'K',
324
        'CIC$', 'ZIZ', 'ZIZ',
325
        'CIENCEFICT----', 'EIENS ', 'EIENZ ',
326
        'CIENCE$', 'EIENS', 'EIENZ',
327
        'CIER$', 'ZIE', 'ZIE',
328
        'CYB-^', 'ZEI', 'ZEI',
329
        'CY9^', 'ZÜ', 'ZI',
330
        'C(IJY)-<3', 'Z', 'Z',
331
        'CLOWN-', 'KLAU', 'KLAU',
332
        'CCH', 'Z', 'Z',
333
        'CCE-', 'X', 'X',
334
        'C(CK)-', '', '',
335
        'CLAUDET---', 'KLO', 'KLU',
336
        'CLAUDINE^$', 'KLODIN', 'KLUTIN',
337
        'COACH', 'KOSH', 'KUZ',
338
        'COLE$', 'KOL', 'KUL',
339
        'COUCH', 'KAUSH', 'KAUZ',
340
        'COW', 'KAU', 'KAU',
341
        'CQUES$', 'K', 'K',
342
        'CQUE', 'K', 'K',
343
        'CRASH--9', 'KRE', 'KRE',
344
        'CREAT-^', 'KREA', 'KREA',
345
        'CST', 'XT', 'XT',
346
        'CS<^', 'Z', 'Z',
347
        'C(SßX)', 'X', 'X',
348
        'CT\'S$', 'X', 'X',
349
        'CT(SßXZ)', 'X', 'X',
350
        'CZ<', 'Z', 'Z',
351
        'C(ÈÉÊÌÍÎÝ)3', 'Z', 'Z',
352
        'C.^', 'C.', 'C.',
353
        'CÄ-', 'Z', 'Z',
354
        'CÜ$', 'ZÜ', 'ZI',
355
        'C\'S$', 'X', 'X',
356
        'C<', 'K', 'K',
357
        'DAHER^$', 'DAHER', None,
358
        'DARAUFFOLGE-----', 'DARAUF ', 'TARAUF ',
359
        'DAVO(NR)-^$', 'DAFO', 'TAFU',
360
        'DD(SZ)--<', '', '',
361
        'DD9', 'D', None,
362
        'DEPOT7', 'DEPO', 'TEBU',
363
        'DESIGN', 'DISEIN', 'TIZEIN',
364
        'DE(LMNRST)-3^', 'DE', 'TE',
365
        'DETTE$', 'DET', 'TET',
366
        'DH$', 'T', None,
367
        'DIC$', 'DIZ', 'TIZ',
368
        'DIDR-^', 'DIT', None,
369
        'DIEDR-^', 'DIT', None,
370
        'DJ(AEIOU)-^', 'I', 'I',
371
        'DMITR-^', 'DIMIT', 'TINIT',
372
        'DRY9^', 'DRÜ', None,
373
        'DT-', '', '',
374
        'DUIS-^', 'DÜ', 'TI',
375
        'DURCH^^', 'DURCH', 'TURK',
376
        'DVA$', 'TWA', None,
377
        'DY9^', 'DÜ', None,
378
        'DYS$', 'DIS', None,
379
        'DS(CH)--<', 'T', 'T',
380
        'DST', 'ZT', 'ZT',
381
        'DZS(CH)--', 'T', 'T',
382
        'D(SßZ)', 'Z', 'Z',
383
        'D(AÄEIOÖRUÜY)-', 'D', None,
384
        'D(ÀÁÂÃÅÈÉÊÌÍÎÙÚÛ)-', 'D', None,
385
        'D\'H^', 'D', 'T',
386
        'D´H^', 'D', 'T',
387
        'D`H^', 'D', 'T',
388
        'D\'S3$', 'Z', 'Z',
389
        'D´S3$', 'Z', 'Z',
390
        'D^', 'D', None,
391
        'D', 'T', 'T',
392
        'EAULT$', 'O', 'U',
393
        'EAUX$', 'O', 'U',
394
        'EAU', 'O', 'U',
395
        'EAV', 'IW', 'IF',
396
        'EAS3$', 'EAS', None,
397
        'EA(AÄEIOÖÜY)-3', 'EA', 'EA',
398
        'EA3$', 'EA', 'EA',
399
        'EA3', 'I', 'I',
400
        'EBENSO^$', 'EBNSO', 'EBNZU',
401
        'EBENSO^^', 'EBNSO ', 'EBNZU ',
402
        'EBEN^^', 'EBN', 'EBN',
403
        'EE9', 'E', 'E',
404
        'EGL-1', 'EK', None,
405
        'EHE(IUY)--1', 'EH', None,
406
        'EHUNG---1', 'E', None,
407
        'EH(AÄIOÖUÜY)-1', 'EH', None,
408
        'EIEI--', '', '',
409
        'EIERE^$', 'EIERE', None,
410
        'EIERE$', 'EIERE', None,
411
        'EIERE(NS)-$', 'EIERE', None,
412
        'EIERE(AIOUY)--', 'EIER', None,
413
        'EIER(AÄIOÖUÜY)-', 'EIER', None,
414
        'EIER<', 'EIA', None,
415
        'EIGL-1', 'EIK', None,
416
        'EIGH$', 'EI', 'EI',
417
        'EIH--', 'E', 'E',
418
        'EILLE$', 'EI', 'EI',
419
        'EIR(BCDFGKLMNQSTVWZ)-', 'EIA', 'EIA',
420
        'EIR$', 'EIA', 'EIA',
421
        'EITRAUBEN------', 'EIT ', 'EIT ',
422
        'EI', 'EI', 'EI',
423
        'EJ$', 'EI', 'EI',
424
        'ELIZ^', 'ELIS', None,
425
        'ELZ^', 'ELS', None,
426
        'EL-^', 'E', 'E',
427
        'ELANG----1', 'E', 'E',
428
        'EL(DKL)--1', 'E', 'E',
429
        'EL(MNT)--1$', 'E', 'E',
430
        'ELYNE$', 'ELINE', 'ELINE',
431
        'ELYN$', 'ELIN', 'ELIN',
432
        'EL(AÄEÈÉÊIÌÍÎOÖUÜY)-1', 'EL', 'EL',
433
        'EL-1', 'L', 'L',
434
        'EM-^', None, 'E',
435
        'EM(DFKMPQT)--1', None, 'E',
436
        'EM(AÄEÈÉÊIÌÍÎOÖUÜY)--1', None, 'E',
437
        'EM-1', None, 'N',
438
        'ENGAG-^', 'ANGA', 'ANKA',
439
        'EN-^', 'E', 'E',
440
        'ENTUEL', 'ENTUEL', None,
441
        'EN(CDGKQSTZ)--1', 'E', 'E',
442
        'EN(AÄEÈÉÊIÌÍÎNOÖUÜY)-1', 'EN', 'EN',
443
        'EN-1', '', '',
444
        'ERH(AÄEIOÖUÜ)-^', 'ERH', 'ER',
445
        'ER-^', 'E', 'E',
446
        'ERREGEND-----', ' ER', ' ER',
447
        'ERT1$', 'AT', None,
448
        'ER(DGLKMNRQTZß)-1', 'ER', None,
449
        'ER(AÄEÈÉÊIÌÍÎOÖUÜY)-1', 'ER', 'A',
450
        'ER1$', 'A', 'A',
451
        'ER<1', 'A', 'A',
452
        'ETAT7', 'ETA', 'ETA',
453
        'ETI(AÄOÖÜU)-', 'EZI', 'EZI',
454
        'EUERE$', 'EUERE', None,
455
        'EUERE(NS)-$', 'EUERE', None,
456
        'EUERE(AIOUY)--', 'EUER', None,
457
        'EUER(AÄIOÖUÜY)-', 'EUER', None,
458
        'EUER<', 'EUA', None,
459
        'EUEU--', '', '',
460
        'EUILLE$', 'Ö', 'Ö',
461
        'EUR$', 'ÖR', 'ÖR',
462
        'EUX', 'Ö', 'Ö',
463
        'EUSZ$', 'EUS', None,
464
        'EUTZ$', 'EUS', None,
465
        'EUYS$', 'EUS', 'EUZ',
466
        'EUZ$', 'EUS', None,
467
        'EU', 'EU', 'EU',
468
        'EVER--<1', 'EW', None,
469
        'EV(ÄOÖUÜ)-1', 'EW', None,
470
        'EYER<', 'EIA', 'EIA',
471
        'EY<', 'EI', 'EI',
472
        'FACETTE', 'FASET', 'FAZET',
473
        'FANS--^$', 'FE', 'FE',
474
        'FAN-^$', 'FE', 'FE',
475
        'FAULT-', 'FOL', 'FUL',
476
        'FEE(DL)-', 'FI', 'FI',
477
        'FEHLER', 'FELA', 'FELA',
478
        'FE(LMNRST)-3^', 'FE', 'FE',
479
        'FOERDERN---^', 'FÖRD', 'FÖRT',
480
        'FOERDERN---', ' FÖRD', ' FÖRT',
481
        'FOND7', 'FON', 'FUN',
482
        'FRAIN$', 'FRA', 'FRA',
483
        'FRISEU(RS)-', 'FRISÖ', 'FRIZÖ',
484
        'FY9^', 'FÜ', None,
485
        'FÖRDERN---^', 'FÖRD', 'FÖRT',
486
        'FÖRDERN---', ' FÖRD', ' FÖRT',
487
        'GAGS^$', 'GEX', 'KEX',
488
        'GAG^$', 'GEK', 'KEK',
489
        'GD', 'KT', 'KT',
490
        'GEGEN^^', 'GEGN', 'KEKN',
491
        'GEGENGEKOM-----', 'GEGN ', 'KEKN ',
492
        'GEGENGESET-----', 'GEGN ', 'KEKN ',
493
        'GEGENKOMME-----', 'GEGN ', 'KEKN ',
494
        'GEGENZUKOM---', 'GEGN ZU ', 'KEKN ZU ',
495
        'GENDETWAS-----$', 'GENT ', 'KENT ',
496
        'GENRE', 'IORE', 'IURE',
497
        'GE(LMNRST)-3^', 'GE', 'KE',
498
        'GER(DKT)-', 'GER', None,
499
        'GETTE$', 'GET', 'KET',
500
        'GGF.', 'GF.', None,
501
        'GG-', '', '',
502
        'GH', 'G', None,
503
        'GI(AOU)-^', 'I', 'I',
504
        'GION-3', 'KIO', 'KIU',
505
        'G(CK)-', '', '',
506
        'GJ(AEIOU)-^', 'I', 'I',
507
        'GMBH^$', 'GMBH', 'GMBH',
508
        'GNAC$', 'NIAK', 'NIAK',
509
        'GNON$', 'NION', 'NIUN',
510
        'GN$', 'N', 'N',
511
        'GONCAL-^', 'GONZA', 'KUNZA',
512
        'GRY9^', 'GRÜ', None,
513
        'G(SßXZ)-<', 'K', 'K',
514
        'GUCK-', 'KU', 'KU',
515
        'GUISEP-^', 'IUSE', 'IUZE',
516
        'GUI-^', 'G', 'K',
517
        'GUTAUSSEH------^', 'GUT ', 'KUT ',
518
        'GUTGEHEND------^', 'GUT ', 'KUT ',
519
        'GY9^', 'GÜ', None,
520
        'G(AÄEILOÖRUÜY)-', 'G', None,
521
        'G(ÀÁÂÃÅÈÉÊÌÍÎÙÚÛ)-', 'G', None,
522
        'G\'S$', 'X', 'X',
523
        'G´S$', 'X', 'X',
524
        'G^', 'G', None,
525
        'G', 'K', 'K',
526
        'HA(HIUY)--1', 'H', None,
527
        'HANDVOL---^', 'HANT ', 'ANT ',
528
        'HANNOVE-^', 'HANOF', None,
529
        'HAVEN7$', 'HAFN', None,
530
        'HEAD-', 'HE', 'E',
531
        'HELIEGEN------', 'E ', 'E ',
532
        'HESTEHEN------', 'E ', 'E ',
533
        'HE(LMNRST)-3^', 'HE', 'E',
534
        'HE(LMN)-1', 'E', 'E',
535
        'HEUR1$', 'ÖR', 'ÖR',
536
        'HE(HIUY)--1', 'H', None,
537
        'HIH(AÄEIOÖUÜY)-1', 'IH', None,
538
        'HLH(AÄEIOÖUÜY)-1', 'LH', None,
539
        'HMH(AÄEIOÖUÜY)-1', 'MH', None,
540
        'HNH(AÄEIOÖUÜY)-1', 'NH', None,
541
        'HOBBY9^', 'HOBI', None,
542
        'HOCHBEGAB-----^', 'HOCH ', 'UK ',
543
        'HOCHTALEN-----^', 'HOCH ', 'UK ',
544
        'HOCHZUFRI-----^', 'HOCH ', 'UK ',
545
        'HO(HIY)--1', 'H', None,
546
        'HRH(AÄEIOÖUÜY)-1', 'RH', None,
547
        'HUH(AÄEIOÖUÜY)-1', 'UH', None,
548
        'HUIS^^', 'HÜS', 'IZ',
549
        'HUIS$', 'ÜS', 'IZ',
550
        'HUI--1', 'H', None,
551
        'HYGIEN^', 'HÜKIEN', None,
552
        'HY9^', 'HÜ', None,
553
        'HY(BDGMNPST)-', 'Ü', None,
554
        'H.^', None, 'H.',
555
        'HÄU--1', 'H', None,
556
        'H^', 'H', '',
557
        'H', '', '',
558
        'ICHELL---', 'ISH', 'IZ',
559
        'ICHI$', 'ISHI', 'IZI',
560
        'IEC$', 'IZ', 'IZ',
561
        'IEDENSTELLE------', 'IDN ', 'ITN ',
562
        'IEI-3', '', '',
563
        'IELL3', 'IEL', 'IEL',
564
        'IENNE$', 'IN', 'IN',
565
        'IERRE$', 'IER', 'IER',
566
        'IERZULAN---', 'IR ZU ', 'IR ZU ',
567
        'IETTE$', 'IT', 'IT',
568
        'IEU', 'IÖ', 'IÖ',
569
        'IE<4', 'I', 'I',
570
        'IGL-1', 'IK', None,
571
        'IGHT3$', 'EIT', 'EIT',
572
        'IGNI(EO)-', 'INI', 'INI',
573
        'IGN(AEOU)-$', 'INI', 'INI',
574
        'IHER(DGLKRT)--1', 'IHE', None,
575
        'IHE(IUY)--', 'IH', None,
576
        'IH(AIOÖUÜY)-', 'IH', None,
577
        'IJ(AOU)-', 'I', 'I',
578
        'IJ$', 'I', 'I',
579
        'IJ<', 'EI', 'EI',
580
        'IKOLE$', 'IKOL', 'IKUL',
581
        'ILLAN(STZ)--4', 'ILIA', 'ILIA',
582
        'ILLAR(DT)--4', 'ILIA', 'ILIA',
583
        'IMSTAN----^', 'IM ', 'IN ',
584
        'INDELERREGE------', 'INDL ', 'INTL ',
585
        'INFRAGE-----^$', 'IN ', 'IN ',
586
        'INTERN(AOU)-^', 'INTAN', 'INTAN',
587
        'INVER-', 'INWE', 'INFE',
588
        'ITI(AÄIOÖUÜ)-', 'IZI', 'IZI',
589
        'IUSZ$', 'IUS', None,
590
        'IUTZ$', 'IUS', None,
591
        'IUZ$', 'IUS', None,
592
        'IVER--<', 'IW', None,
593
        'IVIER$', 'IWIE', 'IFIE',
594
        'IV(ÄOÖUÜ)-', 'IW', None,
595
        'IV<3', 'IW', None,
596
        'IY2', 'I', None,
597
        'I(ÈÉÊ)<4', 'I', 'I',
598
        'JAVIE---<^', 'ZA', 'ZA',
599
        'JEANS^$', 'JINS', 'INZ',
600
        'JEANNE^$', 'IAN', 'IAN',
601
        'JEAN-^', 'IA', 'IA',
602
        'JER-^', 'IE', 'IE',
603
        'JE(LMNST)-', 'IE', 'IE',
604
        'JI^', 'JI', None,
605
        'JOR(GK)^$', 'IÖRK', 'IÖRK',
606
        'J', 'I', 'I',
607
        'KC(ÄEIJ)-', 'X', 'X',
608
        'KD', 'KT', None,
609
        'KE(LMNRST)-3^', 'KE', 'KE',
610
        'KG(AÄEILOÖRUÜY)-', 'K', None,
611
        'KH<^', 'K', 'K',
612
        'KIC$', 'KIZ', 'KIZ',
613
        'KLE(LMNRST)-3^', 'KLE', 'KLE',
614
        'KOTELE-^', 'KOTL', 'KUTL',
615
        'KREAT-^', 'KREA', 'KREA',
616
        'KRÜS(TZ)--^', 'KRI', None,
617
        'KRYS(TZ)--^', 'KRI', None,
618
        'KRY9^', 'KRÜ', None,
619
        'KSCH---', 'K', 'K',
620
        'KSH--', 'K', 'K',
621
        'K(SßXZ)7', 'X', 'X',  # implies 'KST' -> 'XT'
622
        'KT\'S$', 'X', 'X',
623
        'KTI(AIOU)-3', 'XI', 'XI',
624
        'KT(SßXZ)', 'X', 'X',
625
        'KY9^', 'KÜ', None,
626
        'K\'S$', 'X', 'X',
627
        'K´S$', 'X', 'X',
628
        'LANGES$', ' LANGES', ' LANKEZ',
629
        'LANGE$', ' LANGE', ' LANKE',
630
        'LANG$', ' LANK', ' LANK',
631
        'LARVE-', 'LARF', 'LARF',
632
        'LD(SßZ)$', 'LS', 'LZ',
633
        'LD\'S$', 'LS', 'LZ',
634
        'LD´S$', 'LS', 'LZ',
635
        'LEAND-^', 'LEAN', 'LEAN',
636
        'LEERSTEHE-----^', 'LER ', 'LER ',
637
        'LEICHBLEIB-----', 'LEICH ', 'LEIK ',
638
        'LEICHLAUTE-----', 'LEICH ', 'LEIK ',
639
        'LEIDERREGE------', 'LEIT ', 'LEIT ',
640
        'LEIDGEPR----^', 'LEIT ', 'LEIT ',
641
        'LEINSTEHE-----', 'LEIN ', 'LEIN ',
642
        'LEL-', 'LE', 'LE',
643
        'LE(MNRST)-3^', 'LE', 'LE',
644
        'LETTE$', 'LET', 'LET',
645
        'LFGNAG-', 'LFGAN', 'LFKAN',
646
        'LICHERWEIS----', 'LICHA ', 'LIKA ',
647
        'LIC$', 'LIZ', 'LIZ',
648
        'LIVE^$', 'LEIF', 'LEIF',
649
        'LT(SßZ)$', 'LS', 'LZ',
650
        'LT\'S$', 'LS', 'LZ',
651
        'LT´S$', 'LS', 'LZ',
652
        'LUI(GS)--', 'LU', 'LU',
653
        'LV(AIO)-', 'LW', None,
654
        'LY9^', 'LÜ', None,
655
        'LSTS$', 'LS', 'LZ',
656
        'LZ(BDFGKLMNPQRSTVWX)-', 'LS', None,
657
        'L(SßZ)$', 'LS', None,
658
        'MAIR-<', 'MEI', 'NEI',
659
        'MANAG-', 'MENE', 'NENE',
660
        'MANUEL', 'MANUEL', None,
661
        'MASSEU(RS)-', 'MASÖ', 'NAZÖ',
662
        'MATCH', 'MESH', 'NEZ',
663
        'MAURICE', 'MORIS', 'NURIZ',
664
        'MBH^$', 'MBH', 'MBH',
665
        'MB(ßZ)$', 'MS', None,
666
        'MB(SßTZ)-', 'M', 'N',
667
        'MCG9^', 'MAK', 'NAK',
668
        'MC9^', 'MAK', 'NAK',
669
        'MEMOIR-^', 'MEMOA', 'NENUA',
670
        'MERHAVEN$', 'MAHAFN', None,
671
        'ME(LMNRST)-3^', 'ME', 'NE',
672
        'MEN(STZ)--3', 'ME', None,
673
        'MEN$', 'MEN', None,
674
        'MIGUEL-', 'MIGE', 'NIKE',
675
        'MIKE^$', 'MEIK', 'NEIK',
676
        'MITHILFE----^$', 'MIT H', 'NIT ',
677
        'MN$', 'M', None,
678
        'MN', 'N', 'N',
679
        'MPJUTE-', 'MPUT', 'NBUT',
680
        'MP(ßZ)$', 'MS', None,
681
        'MP(SßTZ)-', 'M', 'N',
682
        'MP(BDJLMNPQVW)-', 'MB', 'NB',
683
        'MY9^', 'MÜ', None,
684
        'M(ßZ)$', 'MS', None,
685
        'M´G7^', 'MAK', 'NAK',
686
        'M\'G7^', 'MAK', 'NAK',
687
        'M´^', 'MAK', 'NAK',
688
        'M\'^', 'MAK', 'NAK',
689
        'M', None, 'N',
690
        'NACH^^', 'NACH', 'NAK',
691
        'NADINE', 'NADIN', 'NATIN',
692
        'NAIV--', 'NA', 'NA',
693
        'NAISE$', 'NESE', 'NEZE',
694
        'NAUGENOMM------', 'NAU ', 'NAU ',
695
        'NAUSOGUT$', 'NAUSO GUT', 'NAUZU KUT',
696
        'NCH$', 'NSH', 'NZ',
697
        'NCOISE$', 'SOA', 'ZUA',
698
        'NCOIS$', 'SOA', 'ZUA',
699
        'NDAR$', 'NDA', 'NTA',
700
        'NDERINGEN------', 'NDE ', 'NTE ',
701
        'NDRO(CDKTZ)-', 'NTRO', None,
702
        'ND(BFGJLMNPQVW)-', 'NT', None,
703
        'ND(SßZ)$', 'NS', 'NZ',
704
        'ND\'S$', 'NS', 'NZ',
705
        'ND´S$', 'NS', 'NZ',
706
        'NEBEN^^', 'NEBN', 'NEBN',
707
        'NENGELERN------', 'NEN ', 'NEN ',
708
        'NENLERN(ET)---', 'NEN LE', 'NEN LE',
709
        'NENZULERNE---', 'NEN ZU LE', 'NEN ZU LE',
710
        'NE(LMNRST)-3^', 'NE', 'NE',
711
        'NEN-3', 'NE', 'NE',
712
        'NETTE$', 'NET', 'NET',
713
        'NGU^^', 'NU', 'NU',
714
        'NG(BDFJLMNPQRTVW)-', 'NK', 'NK',
715
        'NH(AUO)-$', 'NI', 'NI',
716
        'NICHTSAHNEN-----', 'NIX ', 'NIX ',
717
        'NICHTSSAGE----', 'NIX ', 'NIX ',
718
        'NICHTS^^', 'NIX', 'NIX',
719
        'NICHT^^', 'NICHT', 'NIKT',
720
        'NINE$', 'NIN', 'NIN',
721
        'NON^^', 'NON', 'NUN',
722
        'NOTLEIDE-----^', 'NOT ', 'NUT ',
723
        'NOT^^', 'NOT', 'NUT',
724
        'NTI(AIOU)-3', 'NZI', 'NZI',
725
        'NTIEL--3', 'NZI', 'NZI',
726
        'NT(SßZ)$', 'NS', 'NZ',
727
        'NT\'S$', 'NS', 'NZ',
728
        'NT´S$', 'NS', 'NZ',
729
        'NYLON', 'NEILON', 'NEILUN',
730
        'NY9^', 'NÜ', None,
731
        'NSTZUNEH---', 'NST ZU ', 'NZT ZU ',
732
        'NSZ-', 'NS', None,
733
        'NSTS$', 'NS', 'NZ',
734
        'NZ(BDFGKLMNPQRSTVWX)-', 'NS', None,
735
        'N(SßZ)$', 'NS', None,
736
        'OBERE-', 'OBER', None,
737
        'OBER^^', 'OBA', 'UBA',
738
        'OEU2', 'Ö', 'Ö',
739
        'OE<2', 'Ö', 'Ö',
740
        'OGL-', 'OK', None,
741
        'OGNIE-', 'ONI', 'UNI',
742
        'OGN(AEOU)-$', 'ONI', 'UNI',
743
        'OH(AIOÖUÜY)-', 'OH', None,
744
        'OIE$', 'Ö', 'Ö',
745
        'OIRE$', 'OA', 'UA',
746
        'OIR$', 'OA', 'UA',
747
        'OIX', 'OA', 'UA',
748
        'OI<3', 'EU', 'EU',
749
        'OKAY^$', 'OKE', 'UKE',
750
        'OLYN$', 'OLIN', 'ULIN',
751
        'OO(DLMZ)-', 'U', None,
752
        'OO$', 'U', None,
753
        'OO-', '', '',
754
        'ORGINAL-----', 'ORI', 'URI',
755
        'OTI(AÄOÖUÜ)-', 'OZI', 'UZI',
756
        'OUI^', 'WI', 'FI',
757
        'OUILLE$', 'ULIE', 'ULIE',
758
        'OU(DT)-^', 'AU', 'AU',
759
        'OUSE$', 'AUS', 'AUZ',
760
        'OUT-', 'AU', 'AU',
761
        'OU', 'U', 'U',
762
        'O(FV)$', 'AU', 'AU',  # due to 'OW$' -> 'AU'
763
        'OVER--<', 'OW', None,
764
        'OV(AOU)-', 'OW', None,
765
        'OW$', 'AU', 'AU',
766
        'OWS$', 'OS', 'UZ',
767
        'OJ(AÄEIOÖUÜ)--', 'O', 'U',
768
        'OYER', 'OIA', None,
769
        'OY(AÄEIOÖUÜ)--', 'O', 'U',
770
        'O(JY)<', 'EU', 'EU',
771
        'OZ$', 'OS', None,
772
        'O´^', 'O', 'U',
773
        'O\'^', 'O', 'U',
774
        'O', None, 'U',
775
        'PATIEN--^', 'PAZI', 'PAZI',
776
        'PENSIO-^', 'PANSI', 'PANZI',
777
        'PE(LMNRST)-3^', 'PE', 'PE',
778
        'PFER-^', 'FE', 'FE',
779
        'P(FH)<', 'F', 'F',
780
        'PIC^$', 'PIK', 'PIK',
781
        'PIC$', 'PIZ', 'PIZ',
782
        'PIPELINE', 'PEIBLEIN', 'PEIBLEIN',
783
        'POLYP-', 'POLÜ', None,
784
        'POLY^^', 'POLI', 'PULI',
785
        'PORTRAIT7', 'PORTRE', 'PURTRE',
786
        'POWER7', 'PAUA', 'PAUA',
787
        'PP(FH)--<', 'B', 'B',
788
        'PP-', '', '',
789
        'PRODUZ-^', 'PRODU', 'BRUTU',
790
        'PRODUZI--', ' PRODU', ' BRUTU',
791
        'PRIX^$', 'PRI', 'PRI',
792
        'PS-^^', 'P', None,
793
        'P(SßZ)^', None, 'Z',
794
        'P(SßZ)$', 'BS', None,
795
        'PT-^', '', '',
796
        'PTI(AÄOÖUÜ)-3', 'BZI', 'BZI',
797
        'PY9^', 'PÜ', None,
798
        'P(AÄEIOÖRUÜY)-', 'P', 'P',
799
        'P(ÀÁÂÃÅÈÉÊÌÍÎÙÚÛ)-', 'P', None,
800
        'P.^', None, 'P.',
801
        'P^', 'P', None,
802
        'P', 'B', 'B',
803
        'QI-', 'Z', 'Z',
804
        'QUARANT--', 'KARA', 'KARA',
805
        'QUE(LMNRST)-3', 'KWE', 'KFE',
806
        'QUE$', 'K', 'K',
807
        'QUI(NS)$', 'KI', 'KI',
808
        'QUIZ7', 'KWIS', None,
809
        'Q(UV)7', 'KW', 'KF',
810
        'Q<', 'K', 'K',
811
        'RADFAHR----', 'RAT ', 'RAT ',
812
        'RAEFTEZEHRE-----', 'REFTE ', 'REFTE ',
813
        'RCH', 'RCH', 'RK',
814
        'REA(DU)---3^', 'R', None,
815
        'REBSERZEUG------', 'REBS ', 'REBZ ',
816
        'RECHERCH^', 'RESHASH', 'REZAZ',
817
        'RECYCL--', 'RIZEI', 'RIZEI',
818
        'RE(ALST)-3^', 'RE', None,
819
        'REE$', 'RI', 'RI',
820
        'RER$', 'RA', 'RA',
821
        'RE(MNR)-4', 'RE', 'RE',
822
        'RETTE$', 'RET', 'RET',
823
        'REUZ$', 'REUZ', None,
824
        'REW$', 'RU', 'RU',
825
        'RH<^', 'R', 'R',
826
        'RJA(MN)--', 'RI', 'RI',
827
        'ROWD-^', 'RAU', 'RAU',
828
        'RTEMONNAIE-', 'RTMON', 'RTNUN',
829
        'RTI(AÄOÖUÜ)-3', 'RZI', 'RZI',
830
        'RTIEL--3', 'RZI', 'RZI',
831
        'RV(AEOU)-3', 'RW', None,
832
        'RY(KN)-$', 'RI', 'RI',
833
        'RY9^', 'RÜ', None,
834
        'RÄFTEZEHRE-----', 'REFTE ', 'REFTE ',
835
        'SAISO-^', 'SES', 'ZEZ',
836
        'SAFE^$', 'SEIF', 'ZEIF',
837
        'SAUCE-^', 'SOS', 'ZUZ',
838
        'SCHLAGGEBEN-----<', 'SHLAK ', 'ZLAK ',
839
        'SCHSCH---7', '', '',
840
        'SCHTSCH', 'SH', 'Z',
841
        'SC(HZ)<', 'SH', 'Z',
842
        'SC', 'SK', 'ZK',
843
        'SELBSTST--7^^', 'SELB', 'ZELB',
844
        'SELBST7^^', 'SELBST', 'ZELBZT',
845
        'SERVICE7^', 'SÖRWIS', 'ZÖRFIZ',
846
        'SERVI-^', 'SERW', None,
847
        'SE(LMNRST)-3^', 'SE', 'ZE',
848
        'SETTE$', 'SET', 'ZET',
849
        'SHP-^', 'S', 'Z',
850
        'SHST', 'SHT', 'ZT',
851
        'SHTSH', 'SH', 'Z',
852
        'SHT', 'ST', 'Z',
853
        'SHY9^', 'SHÜ', None,
854
        'SH^^', 'SH', None,
855
        'SH3', 'SH', 'Z',
856
        'SICHERGEGAN-----^', 'SICHA ', 'ZIKA ',
857
        'SICHERGEHE----^', 'SICHA ', 'ZIKA ',
858
        'SICHERGESTEL------^', 'SICHA ', 'ZIKA ',
859
        'SICHERSTELL-----^', 'SICHA ', 'ZIKA ',
860
        'SICHERZU(GS)--^', 'SICHA ZU ', 'ZIKA ZU ',
861
        'SIEGLI-^', 'SIKL', 'ZIKL',
862
        'SIGLI-^', 'SIKL', 'ZIKL',
863
        'SIGHT', 'SEIT', 'ZEIT',
864
        'SIGN', 'SEIN', 'ZEIN',
865
        'SKI(NPZ)-', 'SKI', 'ZKI',
866
        'SKI<^', 'SHI', 'ZI',
867
        'SODASS^$', 'SO DAS', 'ZU TAZ',
868
        'SODAß^$', 'SO DAS', 'ZU TAZ',
869
        'SOGENAN--^', 'SO GEN', 'ZU KEN',
870
        'SOUND-', 'SAUN', 'ZAUN',
871
        'STAATS^^', 'STAZ', 'ZTAZ',
872
        'STADT^^', 'STAT', 'ZTAT',
873
        'STANDE$', ' STANDE', ' ZTANTE',
874
        'START^^', 'START', 'ZTART',
875
        'STAURANT7', 'STORAN', 'ZTURAN',
876
        'STEAK-', 'STE', 'ZTE',
877
        'STEPHEN-^$', 'STEW', None,
878
        'STERN', 'STERN', None,
879
        'STRAF^^', 'STRAF', 'ZTRAF',
880
        'ST\'S$', 'Z', 'Z',
881
        'ST´S$', 'Z', 'Z',
882
        'STST--', '', '',
883
        'STS(ACEÈÉÊHIÌÍÎOUÄÜÖ)--', 'ST', 'ZT',
884
        'ST(SZ)', 'Z', 'Z',
885
        'SPAREN---^', 'SPA', 'ZPA',
886
        'SPAREND----', ' SPA', ' ZPA',
887
        'S(PTW)-^^', 'S', None,
888
        'SP', 'SP', None,
889
        'STYN(AE)-$', 'STIN', 'ZTIN',
890
        'ST', 'ST', 'ZT',
891
        'SUITE<', 'SIUT', 'ZIUT',
892
        'SUKE--$', 'S', 'Z',
893
        'SURF(EI)-', 'SÖRF', 'ZÖRF',
894
        'SV(AEÈÉÊIÌÍÎOU)-<^', 'SW', None,
895
        'SYB(IY)--^', 'SIB', None,
896
        'SYL(KVW)--^', 'SI', None,
897
        'SY9^', 'SÜ', None,
898
        'SZE(NPT)-^', 'ZE', 'ZE',
899
        'SZI(ELN)-^', 'ZI', 'ZI',
900
        'SZCZ<', 'SH', 'Z',
901
        'SZT<', 'ST', 'ZT',
902
        'SZ<3', 'SH', 'Z',
903
        'SÜL(KVW)--^', 'SI', None,
904
        'S', None, 'Z',
905
        'TCH', 'SH', 'Z',
906
        'TD(AÄEIOÖRUÜY)-', 'T', None,
907
        'TD(ÀÁÂÃÅÈÉÊËÌÍÎÏÒÓÔÕØÙÚÛÝŸ)-', 'T', None,
908
        'TEAT-^', 'TEA', 'TEA',
909
        'TERRAI7^', 'TERA', 'TERA',
910
        'TE(LMNRST)-3^', 'TE', 'TE',
911
        'TH<', 'T', 'T',
912
        'TICHT-', 'TIK', 'TIK',
913
        'TICH$', 'TIK', 'TIK',
914
        'TIC$', 'TIZ', 'TIZ',
915
        'TIGGESTELL-------', 'TIK ', 'TIK ',
916
        'TIGSTELL-----', 'TIK ', 'TIK ',
917
        'TOAS-^', 'TO', 'TU',
918
        'TOILET-', 'TOLE', 'TULE',
919
        'TOIN-', 'TOA', 'TUA',
920
        'TRAECHTI-^', 'TRECHT', 'TREKT',
921
        'TRAECHTIG--', ' TRECHT', ' TREKT',
922
        'TRAINI-', 'TREN', 'TREN',
923
        'TRÄCHTI-^', 'TRECHT', 'TREKT',
924
        'TRÄCHTIG--', ' TRECHT', ' TREKT',
925
        'TSCH', 'SH', 'Z',
926
        'TSH', 'SH', 'Z',
927
        'TST', 'ZT', 'ZT',
928
        'T(Sß)', 'Z', 'Z',
929
        'TT(SZ)--<', '', '',
930
        'TT9', 'T', 'T',
931
        'TV^$', 'TV', 'TV',
932
        'TX(AEIOU)-3', 'SH', 'Z',
933
        'TY9^', 'TÜ', None,
934
        'TZ-', '', '',
935
        'T\'S3$', 'Z', 'Z',
936
        'T´S3$', 'Z', 'Z',
937
        'UEBEL(GNRW)-^^', 'ÜBL ', 'IBL ',
938
        'UEBER^^', 'ÜBA', 'IBA',
939
        'UE2', 'Ü', 'I',
940
        'UGL-', 'UK', None,
941
        'UH(AOÖUÜY)-', 'UH', None,
942
        'UIE$', 'Ü', 'I',
943
        'UM^^', 'UM', 'UN',
944
        'UNTERE--3', 'UNTE', 'UNTE',
945
        'UNTER^^', 'UNTA', 'UNTA',
946
        'UNVER^^', 'UNFA', 'UNFA',
947
        'UN^^', 'UN', 'UN',
948
        'UTI(AÄOÖUÜ)-', 'UZI', 'UZI',
949
        'UVE-4', 'UW', None,
950
        'UY2', 'UI', None,
951
        'UZZ', 'AS', 'AZ',
952
        'VACL-^', 'WAZ', 'FAZ',
953
        'VAC$', 'WAZ', 'FAZ',
954
        'VAN DEN ^', 'FANDN', 'FANTN',
955
        'VANES-^', 'WANE', None,
956
        'VATRO-', 'WATR', None,
957
        'VA(DHJNT)--^', 'F', None,
958
        'VEDD-^', 'FE', 'FE',
959
        'VE(BEHIU)--^', 'F', None,
960
        'VEL(BDLMNT)-^', 'FEL', None,
961
        'VENTZ-^', 'FEN', None,
962
        'VEN(NRSZ)-^', 'FEN', None,
963
        'VER(AB)-^$', 'WER', None,
964
        'VERBAL^$', 'WERBAL', None,
965
        'VERBAL(EINS)-^', 'WERBAL', None,
966
        'VERTEBR--', 'WERTE', None,
967
        'VEREIN-----', 'F', None,
968
        'VEREN(AEIOU)-^', 'WEREN', None,
969
        'VERIFI', 'WERIFI', None,
970
        'VERON(AEIOU)-^', 'WERON', None,
971
        'VERSEN^', 'FERSN', 'FAZN',
972
        'VERSIERT--^', 'WERSI', None,
973
        'VERSIO--^', 'WERS', None,
974
        'VERSUS', 'WERSUS', None,
975
        'VERTI(GK)-', 'WERTI', None,
976
        'VER^^', 'FER', 'FA',
977
        'VERSPRECHE-------', ' FER', ' FA',
978
        'VER$', 'WA', None,
979
        'VER', 'FA', 'FA',
980
        'VET(HT)-^', 'FET', 'FET',
981
        'VETTE$', 'WET', 'FET',
982
        'VE^', 'WE', None,
983
        'VIC$', 'WIZ', 'FIZ',
984
        'VIELSAGE----', 'FIL ', 'FIL ',
985
        'VIEL', 'FIL', 'FIL',
986
        'VIEW', 'WIU', 'FIU',
987
        'VILL(AE)-', 'WIL', None,
988
        'VIS(ACEIKUVWZ)-<^', 'WIS', None,
989
        'VI(ELS)--^', 'F', None,
990
        'VILLON--', 'WILI', 'FILI',
991
        'VIZE^^', 'FIZE', 'FIZE',
992
        'VLIE--^', 'FL', None,
993
        'VL(AEIOU)--', 'W', None,
994
        'VOKA-^', 'WOK', None,
995
        'VOL(ATUVW)--^', 'WO', None,
996
        'VOR^^', 'FOR', 'FUR',
997
        'VR(AEIOU)--', 'W', None,
998
        'VV9', 'W', None,
999
        'VY9^', 'WÜ', 'FI',
1000
        'V(ÜY)-', 'W', None,
1001
        'V(ÀÁÂÃÅÈÉÊÌÍÎÙÚÛ)-', 'W', None,
1002
        'V(AEIJLRU)-<', 'W', None,
1003
        'V.^', 'V.', None,
1004
        'V<', 'F', 'F',
1005
        'WEITERENTWI-----^', 'WEITA ', 'FEITA ',
1006
        'WEITREICH-----^', 'WEIT ', 'FEIT ',
1007
        'WEITVER^', 'WEIT FER', 'FEIT FA',
1008
        'WE(LMNRST)-3^', 'WE', 'FE',
1009
        'WER(DST)-', 'WER', None,
1010
        'WIC$', 'WIZ', 'FIZ',
1011
        'WIEDERU--', 'WIDE', 'FITE',
1012
        'WIEDER^$', 'WIDA', 'FITA',
1013
        'WIEDER^^', 'WIDA ', 'FITA ',
1014
        'WIEVIEL', 'WI FIL', 'FI FIL',
1015
        'WISUEL', 'WISUEL', None,
1016
        'WR-^', 'W', None,
1017
        'WY9^', 'WÜ', 'FI',
1018
        'W(BDFGJKLMNPQRSTZ)-', 'F', None,
1019
        'W$', 'F', None,
1020
        'W', None, 'F',
1021
        'X<^', 'Z', 'Z',
1022
        'XHAVEN$', 'XAFN', None,
1023
        'X(CSZ)', 'X', 'X',
1024
        'XTS(CH)--', 'XT', 'XT',
1025
        'XT(SZ)', 'Z', 'Z',
1026
        'YE(LMNRST)-3^', 'IE', 'IE',
1027
        'YE-3', 'I', 'I',
1028
        'YOR(GK)^$', 'IÖRK', 'IÖRK',
1029
        'Y(AOU)-<7', 'I', 'I',
1030
        'Y(BKLMNPRSTX)-1', 'Ü', None,
1031
        'YVES^$', 'IF', 'IF',
1032
        'YVONNE^$', 'IWON', 'IFUN',
1033
        'Y.^', 'Y.', None,
1034
        'Y', 'I', 'I',
1035
        'ZC(AOU)-', 'SK', 'ZK',
1036
        'ZE(LMNRST)-3^', 'ZE', 'ZE',
1037
        'ZIEJ$', 'ZI', 'ZI',
1038
        'ZIGERJA(HR)-3', 'ZIGA IA', 'ZIKA IA',
1039
        'ZL(AEIOU)-', 'SL', None,
1040
        'ZS(CHT)--', '', '',
1041
        'ZS', 'SH', 'Z',
1042
        'ZUERST', 'ZUERST', 'ZUERST',
1043
        'ZUGRUNDE^$', 'ZU GRUNDE', 'ZU KRUNTE',
1044
        'ZUGRUNDE', 'ZU GRUNDE ', 'ZU KRUNTE ',
1045
        'ZUGUNSTEN', 'ZU GUNSTN', 'ZU KUNZTN',
1046
        'ZUHAUSE-', 'ZU HAUS', 'ZU AUZ',
1047
        'ZULASTEN^$', 'ZU LASTN', 'ZU LAZTN',
1048
        'ZURUECK^^', 'ZURÜK', 'ZURIK',
1049
        'ZURZEIT', 'ZUR ZEIT', 'ZUR ZEIT',
1050
        'ZURÜCK^^', 'ZURÜK', 'ZURIK',
1051
        'ZUSTANDE', 'ZU STANDE', 'ZU ZTANTE',
1052
        'ZUTAGE', 'ZU TAGE', 'ZU TAKE',
1053
        'ZUVER^^', 'ZUFA', 'ZUFA',
1054
        'ZUVIEL', 'ZU FIL', 'ZU FIL',
1055
        'ZUWENIG', 'ZU WENIK', 'ZU FENIK',
1056
        'ZY9^', 'ZÜ', None,
1057
        'ZYK3$', 'ZIK', None,
1058
        'Z(VW)7^', 'SW', None,
1059
        None, None, None
1060
        # fmt: on
1061
    )
1062
1063 1
    _upper_trans = dict(
1064
        zip(
1065
            (
1066
                ord(_)
0 ignored issues
show
Comprehensibility Best Practice introduced by
The variable _ does not seem to be defined.
Loading history...
1067
                for _ in 'abcdefghijklmnopqrstuvwxyzàáâãåäæ'
1068
                + 'çðèéêëìíîïñòóôõöøœšßþùúûüýÿ'
1069
            ),
1070
            'ABCDEFGHIJKLMNOPQRSTUVWXYZÀÁÂÃÅÄÆ'
1071
            + 'ÇÐÈÉÊËÌÍÎÏÑÒÓÔÕÖØŒŠßÞÙÚÛÜÝŸ',
1072
        )
1073
    )
1074
1075 1
    def encode(self, word, mode=1, lang='de'):
0 ignored issues
show
Bug introduced by
Parameters differ from overridden 'encode' method
Loading history...
1076
        """Return the phonet code for a word.
1077
1078
        Args:
1079
            word (str): The word to transform
1080
            mode (int): The phonet variant to employ (1 or 2)
1081
            lang (str): 'de' (default) for German, 'none' for no language
1082
1083
        Returns:
1084
            str: The phonet value
1085
1086
        Examples:
1087
            >>> pe = Phonet()
1088
            >>> pe.encode('Christopher')
1089
            'KRISTOFA'
1090
            >>> pe.encode('Niall')
1091
            'NIAL'
1092
            >>> pe.encode('Smith')
1093
            'SMIT'
1094
            >>> pe.encode('Schmidt')
1095
            'SHMIT'
1096
1097
            >>> pe.encode('Christopher', mode=2)
1098
            'KRIZTUFA'
1099
            >>> pe.encode('Niall', mode=2)
1100
            'NIAL'
1101
            >>> pe.encode('Smith', mode=2)
1102
            'ZNIT'
1103
            >>> pe.encode('Schmidt', mode=2)
1104
            'ZNIT'
1105
1106
            >>> pe.encode('Christopher', lang='none')
1107
            'CHRISTOPHER'
1108
            >>> pe.encode('Niall', lang='none')
1109
            'NIAL'
1110
            >>> pe.encode('Smith', lang='none')
1111
            'SMITH'
1112
            >>> pe.encode('Schmidt', lang='none')
1113
            'SCHMIDT'
1114
1115
        """
1116 1
        phonet_hash = Counter()
1117 1
        alpha_pos = Counter()
1118
1119 1
        phonet_hash_1 = Counter()
1120 1
        phonet_hash_2 = Counter()
1121
1122 1
        def _initialize_phonet(lang):
            """Fill the rule-lookup tables for the selected rule set.

            Populates, in place, the counters defined in the enclosing
            scope: ``alpha_pos`` (letter -> alphabet slot), ``phonet_hash``
            (first character -> index of its first usable rule) and
            ``phonet_hash_1``/``phonet_hash_2`` (first and last rule index
            for a (letter, following letter) pair).

            Args:
                lang (str): 'none' selects the language-neutral rules; any
                    other value selects the German rules

            """
            rules = (
                self._rules_no_lang if lang == 'none' else self._rules_german
            )

            phonet_hash[''] = -1

            # German and international umlauts all share alphabet slot 1
            for letter in (
                'À', 'Á', 'Â', 'Ã', 'Ä', 'Å', 'Æ', 'Ç', 'È', 'É',
                'Ê', 'Ë', 'Ì', 'Í', 'Î', 'Ï', 'Ð', 'Ñ', 'Ò', 'Ó',
                'Ô', 'Õ', 'Ö', 'Ø', 'Ù', 'Ú', 'Û', 'Ü', 'Ý', 'Þ',
                'ß', 'Œ', 'Š', 'Ÿ',
            ):
                alpha_pos[letter] = 1
                phonet_hash[letter] = -1

            # "normal" letters ('A'-'Z') get slots 2..27
            for slot, letter in enumerate('ABCDEFGHIJKLMNOPQRSTUVWXYZ', 2):
                alpha_pos[letter] = slot
                phonet_hash[letter] = -1

            # mark every (letter, following-letter slot) cell as "no rule"
            for row in range(26):
                for col in range(28):
                    cell = (row, col)
                    phonet_hash_1[cell] = -1
                    phonet_hash_2[cell] = -1

            # The rule table is flat: each rule takes three consecutive
            # entries (pattern, mode-1 result, mode-2 result), so only every
            # third index holds a pattern.
            for idx in range(0, len(rules), 3):
                pattern = rules[idx]
                if not pattern:
                    continue

                # first hash: index of the first rule for this leading
                # character that has a non-empty result in some mode
                lead = pattern[0]
                if phonet_hash[lead] < 0 and (
                    rules[idx + 1] or rules[idx + 2]
                ):
                    phonet_hash[lead] = idx

                # second hashes exist only for the plain letters 'A'-'Z'
                slot = alpha_pos[lead]
                if slot < 2:
                    continue
                row = slot - 2

                # Work out which characters may follow the leading letter:
                # a blank when the pattern is a single character, the
                # members of a '(...)' group, or just the second character.
                tail = pattern[1:]
                if not tail:
                    followers = ' '
                elif tail[0] == '(':
                    followers = tail[1:]
                else:
                    followers = tail[0]

                for follower in followers:
                    if follower == ')':
                        break

                    col = alpha_pos[follower]

                    if col > 0:
                        # add hash value for this letter
                        if phonet_hash_1[row, col] < 0:
                            phonet_hash_1[row, col] = idx
                            phonet_hash_2[row, col] = idx

                        # extend the range only if the previously recorded
                        # rule lies within 30 entries of this one; otherwise
                        # file this rule under the catch-all column instead
                        if phonet_hash_2[row, col] >= idx - 30:
                            phonet_hash_2[row, col] = idx
                        else:
                            col = -1

                    if col <= 0:
                        # add hash value for all letters (column 0)
                        if phonet_hash_1[row, 0] < 0:
                            phonet_hash_1[row, 0] = idx

                        phonet_hash_2[row, 0] = idx
1235
1236 1
        def _phonet(term, mode, lang):
0 ignored issues
show
Comprehensibility introduced by
This function exceeds the maximum number of variables (29/15).
Loading history...
1237
            """Return the phonet coded form of a term.
1238
1239
            Args:
1240
                term (str): Term to transform
1241
                mode (int): The phonet variant to employ (1 or 2)
1242
                lang (str): 'de' (default) for German, 'none' for no language
1243
1244
            Returns:
1245
                str: The phonet value
1246
1247
            """
1248 1
            if lang == 'none':
1249 1
                _phonet_rules = self._rules_no_lang
1250
            else:
1251 1
                _phonet_rules = self._rules_german
1252
1253 1
            char0 = ''
1254 1
            dest = term
1255
1256 1
            if not term:
1257 1
                return ''
1258
1259 1
            term_length = len(term)
1260
1261
            # convert input string to upper-case
1262 1
            src = term.translate(self._upper_trans)
1263
1264
            # check "src"
1265 1
            i = 0
1266 1
            j = 0
1267 1
            zeta = 0
1268
1269 1
            while i < len(src):
0 ignored issues
show
unused-code introduced by
Too many nested blocks (6/5)
Loading history...
unused-code introduced by
Too many nested blocks (8/5)
Loading history...
unused-code introduced by
Too many nested blocks (9/5)
Loading history...
unused-code introduced by
Too many nested blocks (7/5)
Loading history...
1270 1
                char = src[i]
1271
1272 1
                pos = alpha_pos[char]
1273
1274 1
                if pos >= 2:
1275 1
                    xpos = pos - 2
1276
1277 1
                    if i + 1 == len(src):
1278 1
                        pos = alpha_pos['']
1279
                    else:
1280 1
                        pos = alpha_pos[src[i + 1]]
1281
1282 1
                    start1 = phonet_hash_1[xpos, pos]
1283 1
                    start2 = phonet_hash_1[xpos, 0]
1284 1
                    end1 = phonet_hash_2[xpos, pos]
1285 1
                    end2 = phonet_hash_2[xpos, 0]
1286
1287
                    # preserve rule priorities
1288 1
                    if (start2 >= 0) and ((start1 < 0) or (start2 < start1)):
1289 1
                        pos = start1
1290 1
                        start1 = start2
1291 1
                        start2 = pos
1292 1
                        pos = end1
1293 1
                        end1 = end2
1294 1
                        end2 = pos
1295
1296 1
                    if (end1 >= start2) and (start2 >= 0):
1297 1
                        if end2 > end1:
1298 1
                            end1 = end2
1299
1300 1
                        start2 = -1
1301 1
                        end2 = -1
1302
                else:
1303 1
                    pos = phonet_hash[char]
1304 1
                    start1 = pos
1305 1
                    end1 = 10000
1306 1
                    start2 = -1
1307 1
                    end2 = -1
1308
1309 1
                pos = start1
1310 1
                zeta0 = 0
1311
1312 1
                if pos >= 0:
1313
                    # check rules for this char
1314 1
                    while (_phonet_rules[pos] is None) or (
1315
                        _phonet_rules[pos][0] == char
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
1316
                    ):
1317 1
                        if pos > end1:
1318 1
                            if start2 > 0:
1319 1
                                pos = start2
1320 1
                                start1 = start2
1321 1
                                start2 = -1
1322 1
                                end1 = end2
1323 1
                                end2 = -1
1324 1
                                continue
1325
1326 1
                            break
1327
1328 1
                        if (_phonet_rules[pos] is None) or (
1329
                            _phonet_rules[pos + mode] is None
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
1330
                        ):
1331
                            # no conversion rule available
1332 1
                            pos += 3
1333 1
                            continue
1334
1335
                        # check whole string
1336 1
                        matches = 1  # number of matching letters
1337 1
                        priority = 5  # default priority
1338 1
                        rule = _phonet_rules[pos]
1339 1
                        rule = rule[1:]
1340
1341 1
                        while (
1342
                            rule
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
1343
                            and (len(src) > (i + matches))
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
1344
                            and (src[i + matches] == rule[0])
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
1345
                            and not rule[0].isdigit()
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
1346
                            and (rule not in '(-<^$')
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
1347
                        ):
1348 1
                            matches += 1
1349 1
                            rule = rule[1:]
1350
1351 1
                        if rule and (rule[0] == '('):
1352
                            # check an array of letters
1353 1
                            if (
1354
                                (len(src) > (i + matches))
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
1355
                                and src[i + matches].isalpha()
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
1356
                                and (src[i + matches] in rule[1:])
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
1357
                            ):
1358 1
                                matches += 1
1359
1360 1
                                while rule and rule[0] != ')':
1361 1
                                    rule = rule[1:]
1362
1363
                                # if rule[0] == ')':
1364 1
                                rule = rule[1:]
1365
1366 1
                        if rule:
1367 1
                            priority0 = ord(rule[0])
1368
                        else:
1369 1
                            priority0 = 0
1370
1371 1
                        matches0 = matches
1372
1373 1
                        while rule and rule[0] == '-' and matches > 1:
1374 1
                            matches -= 1
1375 1
                            rule = rule[1:]
1376
1377 1
                        if rule and rule[0] == '<':
1378 1
                            rule = rule[1:]
1379
1380 1
                        if rule and rule[0].isdigit():
1381
                            # read priority
1382 1
                            priority = int(rule[0])
1383 1
                            rule = rule[1:]
1384
1385 1
                        if rule and rule[0:2] == '^^':
1386 1
                            rule = rule[1:]
1387
1388 1
                        if (
1389
                            not rule
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
best-practice introduced by
Too many boolean expressions in if statement (12/5)
Loading history...
1390
                            or (
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
1391
                                (rule[0] == '^')
1392
                                and ((i == 0) or not src[i - 1].isalpha())
1393
                                and (
1394
                                    (rule[1:2] != '$')
1395
                                    or (
1396
                                        not (
1397
                                            src[
1398
                                                i + matches0 : i + matches0 + 1
1399
                                            ].isalpha()
1400
                                        )
1401
                                        and (
1402
                                            src[
1403
                                                i + matches0 : i + matches0 + 1
1404
                                            ]
1405
                                            != '.'
1406
                                        )
1407
                                    )
1408
                                )
1409
                            )
1410
                            or (
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
1411
                                (rule[0] == '$')
1412
                                and (i > 0)
1413
                                and src[i - 1].isalpha()
1414
                                and (
1415
                                    (
1416
                                        not src[
1417
                                            i + matches0 : i + matches0 + 1
1418
                                        ].isalpha()
1419
                                    )
1420
                                    and (
1421
                                        src[i + matches0 : i + matches0 + 1]
1422
                                        != '.'
1423
                                    )
1424
                                )
1425
                            )
1426
                        ):
1427
                            # look for continuation, if:
1428
                            # matches > 1 and NO '-' in first string
1429 1
                            pos0 = -1
1430
1431 1
                            start3 = 0
1432 1
                            start4 = 0
1433 1
                            end3 = 0
1434 1
                            end4 = 0
1435
1436 1
                            if (
1437
                                (matches > 1)
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
1438
                                and src[i + matches : i + matches + 1]
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
1439
                                and (priority0 != ord('-'))
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
1440
                            ):
1441 1
                                char0 = src[i + matches - 1]
1442 1
                                pos0 = alpha_pos[char0]
1443
1444 1
                                if pos0 >= 2 and src[i + matches]:
1445 1
                                    xpos = pos0 - 2
1446 1
                                    pos0 = alpha_pos[src[i + matches]]
1447 1
                                    start3 = phonet_hash_1[xpos, pos0]
1448 1
                                    start4 = phonet_hash_1[xpos, 0]
1449 1
                                    end3 = phonet_hash_2[xpos, pos0]
1450 1
                                    end4 = phonet_hash_2[xpos, 0]
1451
1452
                                    # preserve rule priorities
1453 1
                                    if (start4 >= 0) and (
1454
                                        (start3 < 0) or (start4 < start3)
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
1455
                                    ):
1456 1
                                        pos0 = start3
1457 1
                                        start3 = start4
1458 1
                                        start4 = pos0
1459 1
                                        pos0 = end3
1460 1
                                        end3 = end4
1461 1
                                        end4 = pos0
1462
1463 1
                                    if (end3 >= start4) and (start4 >= 0):
1464 1
                                        if end4 > end3:
1465 1
                                            end3 = end4
1466
1467 1
                                        start4 = -1
1468 1
                                        end4 = -1
1469
                                else:
1470 1
                                    pos0 = phonet_hash[char0]
1471 1
                                    start3 = pos0
1472 1
                                    end3 = 10000
1473 1
                                    start4 = -1
1474 1
                                    end4 = -1
1475
1476 1
                                pos0 = start3
1477
1478
                            # check continuation rules for src[i+matches]
1479 1
                            if pos0 >= 0:
1480 1
                                while (_phonet_rules[pos0] is None) or (
1481
                                    _phonet_rules[pos0][0] == char0
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
introduced by
Value '_phonet_rules[pos0]' is unsubscriptable
Loading history...
1482
                                ):
1483 1
                                    if pos0 > end3:
1484 1
                                        if start4 > 0:
1485 1
                                            pos0 = start4
1486 1
                                            start3 = start4
1487 1
                                            start4 = -1
1488 1
                                            end3 = end4
1489 1
                                            end4 = -1
1490 1
                                            continue
1491
1492 1
                                        priority0 = -1
1493
1494
                                        # important
1495 1
                                        break
1496
1497 1
                                    if (_phonet_rules[pos0] is None) or (
1498
                                        _phonet_rules[pos0 + mode] is None
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
1499
                                    ):
1500
                                        # no conversion rule available
1501 1
                                        pos0 += 3
1502 1
                                        continue
1503
1504
                                    # check whole string
1505 1
                                    matches0 = matches
1506 1
                                    priority0 = 5
1507 1
                                    rule = _phonet_rules[pos0]
1508 1
                                    rule = rule[1:]
0 ignored issues
show
introduced by
Value 'rule' is unsubscriptable
Loading history...
1509
1510 1
                                    while (
1511
                                        rule
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
1512
                                        and (
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
1513
                                            src[
1514
                                                i + matches0 : i + matches0 + 1
1515
                                            ]
1516
                                            == rule[0]
1517
                                        )
1518
                                        and (
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
1519
                                            not rule[0].isdigit()
1520
                                            or (rule in '(-<^$')
1521
                                        )
1522
                                    ):
1523 1
                                        matches0 += 1
1524 1
                                        rule = rule[1:]
1525
1526 1
                                    if rule and rule[0] == '(':
1527
                                        # check an array of letters
1528 1
                                        if src[
1529
                                            i + matches0 : i + matches0 + 1
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
1530
                                        ].isalpha() and (
1531
                                            src[i + matches0] in rule[1:]
1532
                                        ):
1533 1
                                            matches0 += 1
1534
1535 1
                                            while rule and rule[0] != ')':
1536 1
                                                rule = rule[1:]
1537
1538
                                            # if rule[0] == ')':
1539 1
                                            rule = rule[1:]
1540
1541 1
                                    while rule and rule[0] == '-':
1542
                                        # "matches0" is NOT decremented
1543
                                        # because of
1544
                                        #    "if (matches0 == matches)"
1545 1
                                        rule = rule[1:]
1546
1547 1
                                    if rule and rule[0] == '<':
1548 1
                                        rule = rule[1:]
1549
1550 1
                                    if rule and rule[0].isdigit():
1551 1
                                        priority0 = int(rule[0])
1552 1
                                        rule = rule[1:]
1553
1554 1
                                    if (
1555
                                        not rule
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
1556
                                        or
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
1557
                                        # rule == '^' is not possible here
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
1558
                                        (
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
1559
                                            (rule[0] == '$')
1560
                                            and not src[
1561
                                                i + matches0 : i + matches0 + 1
1562
                                            ].isalpha()
1563
                                            and (
1564
                                                src[
1565
                                                    i
1566
                                                    + matches0 : i
1567
                                                    + matches0
1568
                                                    + 1
1569
                                                ]
1570
                                                != '.'
1571
                                            )
1572
                                        )
1573
                                    ):
1574 1
                                        if matches0 == matches:
1575
                                            # this is only a partial string
1576 1
                                            pos0 += 3
1577 1
                                            continue
1578
1579 1
                                        if priority0 < priority:
1580
                                            # priority is too low
1581 1
                                            pos0 += 3
1582 1
                                            continue
1583
1584
                                        # continuation rule found
1585 1
                                        break
1586
1587 1
                                    pos0 += 3
1588
1589
                                # end of "while"
1590 1
                                if (priority0 >= priority) and (
1591
                                    (_phonet_rules[pos0] is not None)
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
1592
                                    and (_phonet_rules[pos0][0] == char0)
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
introduced by
Value '_phonet_rules[pos0]' is unsubscriptable
Loading history...
1593
                                ):
1594
1595 1
                                    pos += 3
1596 1
                                    continue
1597
1598
                            # replace string
1599 1
                            if _phonet_rules[pos] and (
1600
                                '<' in _phonet_rules[pos][1:]
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
1601
                            ):
1602 1
                                priority0 = 1
1603
                            else:
1604 1
                                priority0 = 0
1605
1606 1
                            rule = _phonet_rules[pos + mode]
1607
1608 1
                            if (priority0 == 1) and (zeta == 0):
1609
                                # rule with '<' is applied
1610 1
                                if (
1611
                                    (j > 0)
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
1612
                                    and rule
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
1613
                                    and (
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
1614
                                        (dest[j - 1] == char)
1615
                                        or (dest[j - 1] == rule[0])
1616
                                    )
1617
                                ):
1618 1
                                    j -= 1
1619
1620 1
                                zeta0 = 1
1621 1
                                zeta += 1
1622 1
                                matches0 = 0
1623
1624 1
                                while rule and src[i + matches0]:
1625 1
                                    src = (
1626
                                        src[0 : i + matches0]
1627
                                        + rule[0]
1628
                                        + src[i + matches0 + 1 :]
1629
                                    )
1630 1
                                    matches0 += 1
1631 1
                                    rule = rule[1:]
1632
1633 1
                                if matches0 < matches:
1634 1
                                    src = (
1635
                                        src[0 : i + matches0]
1636
                                        + src[i + matches :]
1637
                                    )
1638
1639 1
                                char = src[i]
1640
                            else:
1641 1
                                i = i + matches - 1
1642 1
                                zeta = 0
1643
1644 1
                                while len(rule) > 1:
1645 1
                                    if (j == 0) or (dest[j - 1] != rule[0]):
1646 1
                                        dest = (
1647
                                            dest[0:j]
1648
                                            + rule[0]
1649
                                            + dest[min(len(dest), j + 1) :]
1650
                                        )
1651 1
                                        j += 1
1652
1653 1
                                    rule = rule[1:]
1654
1655
                                # new "current char"
1656 1
                                if not rule:
1657 1
                                    rule = ''
1658 1
                                    char = ''
1659
                                else:
1660 1
                                    char = rule[0]
1661
1662 1
                                if (
1663
                                    _phonet_rules[pos]
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
1664
                                    and '^^' in _phonet_rules[pos][1:]
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
1665
                                ):
1666 1
                                    if char:
1667 1
                                        dest = (
1668
                                            dest[0:j]
1669
                                            + char
1670
                                            + dest[min(len(dest), j + 1) :]
1671
                                        )
1672 1
                                        j += 1
1673
1674 1
                                    src = src[i + 1 :]
1675 1
                                    i = 0
1676 1
                                    zeta0 = 1
1677
1678 1
                            break
1679
1680 1
                        pos += 3
1681
1682 1
                        if pos > end1 and start2 > 0:
1683 1
                            pos = start2
1684 1
                            start1 = start2
1685 1
                            end1 = end2
1686 1
                            start2 = -1
1687 1
                            end2 = -1
1688
1689 1
                if zeta0 == 0:
1690 1
                    if char and ((j == 0) or (dest[j - 1] != char)):
1691
                        # delete multiple letters only
1692 1
                        dest = (
1693
                            dest[0:j] + char + dest[min(j + 1, term_length) :]
1694
                        )
1695 1
                        j += 1
1696
1697 1
                    i += 1
1698 1
                    zeta = 0
1699
1700 1
            dest = dest[0:j]
1701
1702 1
            return dest
1703
1704 1
        _initialize_phonet(lang)
1705
1706 1
        word = unicode_normalize('NFKC', text_type(word))
1707 1
        return _phonet(word, mode, lang)
1708
1709
1710 1
def phonet(word, mode=1, lang='de'):
    """Encode a word with the phonet algorithm.

    Convenience function that builds a :py:class:`Phonet` instance and
    delegates to :py:meth:`Phonet.encode`.

    Args:
        word (str): The word to transform
        mode (int): The phonet variant to employ (1 or 2)
        lang (str): 'de' (default) for German, 'none' for no language

    Returns:
        str: The phonet value

    Examples:
        >>> phonet('Christopher')
        'KRISTOFA'
        >>> phonet('Niall')
        'NIAL'
        >>> phonet('Smith')
        'SMIT'
        >>> phonet('Schmidt')
        'SHMIT'

        >>> phonet('Christopher', mode=2)
        'KRIZTUFA'
        >>> phonet('Niall', mode=2)
        'NIAL'
        >>> phonet('Smith', mode=2)
        'ZNIT'
        >>> phonet('Schmidt', mode=2)
        'ZNIT'

        >>> phonet('Christopher', lang='none')
        'CHRISTOPHER'
        >>> phonet('Niall', lang='none')
        'NIAL'
        >>> phonet('Smith', lang='none')
        'SMITH'
        >>> phonet('Schmidt', lang='none')
        'SCHMIDT'

    """
    # A fresh encoder is created per call; all arguments are forwarded
    # positionally, exactly as received.
    encoder = Phonet()
    return encoder.encode(word, mode, lang)
1753
1754
1755
if __name__ == '__main__':
    # Executed as a script: run the doctest examples found in this
    # module's docstrings.
    from doctest import testmod

    testmod()
1759