Completed
Branch master (78a222)
by Chris
14:36
created

abydos.phonetic._phonet   F

Complexity

Total Complexity 142

Size/Duplication

Total Lines 1673
Duplicated Lines 0 %

Test Coverage

Coverage 100%

Importance

Changes 0
Metric Value
wmc 142
eloc 1386
dl 0
loc 1673
ccs 279
cts 279
cp 1
rs 0.8
c 0
b 0
f 0

1 Function

Rating   Name   Duplication   Size   Complexity  
F phonet() 0 1631 142

How to fix   Complexity   

Complexity

Complex classes like abydos.phonetic._phonet often do a lot of different things. To break such a class down, we need to identify a cohesive component within that class. A common approach to finding such a component is to look for fields/methods that share the same prefixes or suffixes.

Once you have determined the fields that belong together, you can apply the Extract Class refactoring. If the component makes sense as a sub-class, Extract Subclass is also a candidate, and is often faster.

1
# -*- coding: utf-8 -*-
0 ignored issues
show
coding-style introduced by
Too many lines in module (1672/1000)
Loading history...
2
3
# Copyright 2014-2018 by Christopher C. Little.
4
# This file is part of Abydos.
5
#
6
# Abydos is free software: you can redistribute it and/or modify
7
# it under the terms of the GNU General Public License as published by
8
# the Free Software Foundation, either version 3 of the License, or
9
# (at your option) any later version.
10
#
11
# Abydos is distributed in the hope that it will be useful,
12
# but WITHOUT ANY WARRANTY; without even the implied warranty of
13
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14
# GNU General Public License for more details.
15
#
16
# You should have received a copy of the GNU General Public License
17
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.
18
19 1
"""abydos.phonetic._phonet.
20
21
The phonetic._phonet module implements the phonet algorithm (a.k.a. Hannoveraner
22
Phonetik), intended chiefly for German.
23
"""
24
25 1
from __future__ import unicode_literals
26
27 1
from collections import Counter
28 1
from unicodedata import normalize as unicode_normalize
29
30 1
from six import text_type
31 1
from six.moves import range
32
33 1
__all__ = ['phonet']
34
35
36 1
def phonet(word, mode=1, lang='de'):
37
    """Return the phonet code for a word.
38
39
    phonet ("Hannoveraner Phonetik") was developed by Jörg Michael and
40
    documented in :cite:`Michael:1999`.
41
42
    This is a port of Jesper Zedlitz's code, which is licensed LGPL
43
    :cite:`Zedlitz:2015`.
44
45
    That is, in turn, based on Michael's C code, which is also licensed LGPL
46
    :cite:`Michael:2007`.
47
48
    :param str word: the word to transform
49
    :param int mode: the phonet variant to employ (1 or 2)
50
    :param str lang: 'de' (default) for German
51
            'none' for no language
52
    :returns: the phonet value
53
    :rtype: str
54
55
    >>> phonet('Christopher')
56
    'KRISTOFA'
57
    >>> phonet('Niall')
58
    'NIAL'
59
    >>> phonet('Smith')
60
    'SMIT'
61
    >>> phonet('Schmidt')
62
    'SHMIT'
63
64
    >>> phonet('Christopher', mode=2)
65
    'KRIZTUFA'
66
    >>> phonet('Niall', mode=2)
67
    'NIAL'
68
    >>> phonet('Smith', mode=2)
69
    'ZNIT'
70
    >>> phonet('Schmidt', mode=2)
71
    'ZNIT'
72
73
    >>> phonet('Christopher', lang='none')
74
    'CHRISTOPHER'
75
    >>> phonet('Niall', lang='none')
76
    'NIAL'
77
    >>> phonet('Smith', lang='none')
78
    'SMITH'
79
    >>> phonet('Schmidt', lang='none')
80
    'SCHMIDT'
81
    """
82 1
    _phonet_rules_no_lang = (  # separator chars
83
        # fmt: off
84
        '´', ' ', ' ',
85
        '"', ' ', ' ',
86
        '`$', '', '',
87
        '\'', ' ', ' ',
88
        ',', ',', ',',
89
        ';', ',', ',',
90
        '-', ' ', ' ',
91
        ' ', ' ', ' ',
92
        '.', '.', '.',
93
        ':', '.', '.',
94
        # German umlauts
95
        'Ä', 'AE', 'AE',
96
        'Ö', 'OE', 'OE',
97
        'Ü', 'UE', 'UE',
98
        'ß', 'S', 'S',
99
        # international umlauts
100
        'À', 'A', 'A',
101
        'Á', 'A', 'A',
102
        'Â', 'A', 'A',
103
        'Ã', 'A', 'A',
104
        'Å', 'A', 'A',
105
        'Æ', 'AE', 'AE',
106
        'Ç', 'C', 'C',
107
        'Ð', 'DJ', 'DJ',
108
        'È', 'E', 'E',
109
        'É', 'E', 'E',
110
        'Ê', 'E', 'E',
111
        'Ë', 'E', 'E',
112
        'Ì', 'I', 'I',
113
        'Í', 'I', 'I',
114
        'Î', 'I', 'I',
115
        'Ï', 'I', 'I',
116
        'Ñ', 'NH', 'NH',
117
        'Ò', 'O', 'O',
118
        'Ó', 'O', 'O',
119
        'Ô', 'O', 'O',
120
        'Õ', 'O', 'O',
121
        'Œ', 'OE', 'OE',
122
        'Ø', 'OE', 'OE',
123
        'Š', 'SH', 'SH',
124
        'Þ', 'TH', 'TH',
125
        'Ù', 'U', 'U',
126
        'Ú', 'U', 'U',
127
        'Û', 'U', 'U',
128
        'Ý', 'Y', 'Y',
129
        'Ÿ', 'Y', 'Y',
130
        # 'normal' letters (A-Z)
131
        'MC^', 'MAC', 'MAC',
132
        'MC^', 'MAC', 'MAC',
133
        'M´^', 'MAC', 'MAC',
134
        'M\'^', 'MAC', 'MAC',
135
        'O´^', 'O', 'O',
136
        'O\'^', 'O', 'O',
137
        'VAN DEN ^', 'VANDEN', 'VANDEN',
138
        None, None, None
139
        # fmt: on
140
    )
141
142 1
    _phonet_rules_german = (  # separator chars
143
        # fmt: off
144
        '´', ' ', ' ',
145
        '"', ' ', ' ',
146
        '`$', '', '',
147
        '\'', ' ', ' ',
148
        ',', ' ', ' ',
149
        ';', ' ', ' ',
150
        '-', ' ', ' ',
151
        ' ', ' ', ' ',
152
        '.', '.', '.',
153
        ':', '.', '.',
154
        # German umlauts
155
        'ÄE', 'E', 'E',
156
        'ÄU<', 'EU', 'EU',
157
        'ÄV(AEOU)-<', 'EW', None,
158
        'Ä$', 'Ä', None,
159
        'Ä<', None, 'E',
160
        'Ä', 'E', None,
161
        'ÖE', 'Ö', 'Ö',
162
        'ÖU', 'Ö', 'Ö',
163
        'ÖVER--<', 'ÖW', None,
164
        'ÖV(AOU)-', 'ÖW', None,
165
        'ÜBEL(GNRW)-^^', 'ÜBL ', 'IBL ',
166
        'ÜBER^^', 'ÜBA', 'IBA',
167
        'ÜE', 'Ü', 'I',
168
        'ÜVER--<', 'ÜW', None,
169
        'ÜV(AOU)-', 'ÜW', None,
170
        'Ü', None, 'I',
171
        'ßCH<', None, 'Z',
172
        'ß<', 'S', 'Z',
173
        # international umlauts
174
        'À<', 'A', 'A',
175
        'Á<', 'A', 'A',
176
        'Â<', 'A', 'A',
177
        'Ã<', 'A', 'A',
178
        'Å<', 'A', 'A',
179
        'ÆER-', 'E', 'E',
180
        'ÆU<', 'EU', 'EU',
181
        'ÆV(AEOU)-<', 'EW', None,
182
        'Æ$', 'Ä', None,
183
        'Æ<', None, 'E',
184
        'Æ', 'E', None,
185
        'Ç', 'Z', 'Z',
186
        'ÐÐ-', '', '',
187
        'Ð', 'DI', 'TI',
188
        'È<', 'E', 'E',
189
        'É<', 'E', 'E',
190
        'Ê<', 'E', 'E',
191
        'Ë', 'E', 'E',
192
        'Ì<', 'I', 'I',
193
        'Í<', 'I', 'I',
194
        'Î<', 'I', 'I',
195
        'Ï', 'I', 'I',
196
        'ÑÑ-', '', '',
197
        'Ñ', 'NI', 'NI',
198
        'Ò<', 'O', 'U',
199
        'Ó<', 'O', 'U',
200
        'Ô<', 'O', 'U',
201
        'Õ<', 'O', 'U',
202
        'Œ<', 'Ö', 'Ö',
203
        'Ø(IJY)-<', 'E', 'E',
204
        'Ø<', 'Ö', 'Ö',
205
        'Š', 'SH', 'Z',
206
        'Þ', 'T', 'T',
207
        'Ù<', 'U', 'U',
208
        'Ú<', 'U', 'U',
209
        'Û<', 'U', 'U',
210
        'Ý<', 'I', 'I',
211
        'Ÿ<', 'I', 'I',
212
        # 'normal' letters (A-Z)
213
        'ABELLE$', 'ABL', 'ABL',
214
        'ABELL$', 'ABL', 'ABL',
215
        'ABIENNE$', 'ABIN', 'ABIN',
216
        'ACHME---^', 'ACH', 'AK',
217
        'ACEY$', 'AZI', 'AZI',
218
        'ADV', 'ATW', None,
219
        'AEGL-', 'EK', None,
220
        'AEU<', 'EU', 'EU',
221
        'AE2', 'E', 'E',
222
        'AFTRAUBEN------', 'AFT ', 'AFT ',
223
        'AGL-1', 'AK', None,
224
        'AGNI-^', 'AKN', 'AKN',
225
        'AGNIE-', 'ANI', 'ANI',
226
        'AGN(AEOU)-$', 'ANI', 'ANI',
227
        'AH(AIOÖUÜY)-', 'AH', None,
228
        'AIA2', 'AIA', 'AIA',
229
        'AIE$', 'E', 'E',
230
        'AILL(EOU)-', 'ALI', 'ALI',
231
        'AINE$', 'EN', 'EN',
232
        'AIRE$', 'ER', 'ER',
233
        'AIR-', 'E', 'E',
234
        'AISE$', 'ES', 'EZ',
235
        'AISSANCE$', 'ESANS', 'EZANZ',
236
        'AISSE$', 'ES', 'EZ',
237
        'AIX$', 'EX', 'EX',
238
        'AJ(AÄEÈÉÊIOÖUÜ)--', 'A', 'A',
239
        'AKTIE', 'AXIE', 'AXIE',
240
        'AKTUEL', 'AKTUEL', None,
241
        'ALOI^', 'ALOI', 'ALUI',  # Don't merge these rules
242
        'ALOY^', 'ALOI', 'ALUI',  # needed by 'check_rules'
243
        'AMATEU(RS)-', 'AMATÖ', 'ANATÖ',
244
        'ANCH(OEI)-', 'ANSH', 'ANZ',
245
        'ANDERGEGANG----', 'ANDA GE', 'ANTA KE',
246
        'ANDERGEHE----', 'ANDA ', 'ANTA ',
247
        'ANDERGESETZ----', 'ANDA GE', 'ANTA KE',
248
        'ANDERGING----', 'ANDA ', 'ANTA ',
249
        'ANDERSETZ(ET)-----', 'ANDA ', 'ANTA ',
250
        'ANDERZUGEHE----', 'ANDA ZU ', 'ANTA ZU ',
251
        'ANDERZUSETZE-----', 'ANDA ZU ', 'ANTA ZU ',
252
        'ANER(BKO)---^^', 'AN', None,
253
        'ANHAND---^$', 'AN H', 'AN ',
254
        'ANH(AÄEIOÖUÜY)--^^', 'AN', None,
255
        'ANIELLE$', 'ANIEL', 'ANIL',
256
        'ANIEL', 'ANIEL', None,
257
        'ANSTELLE----^$', 'AN ST', 'AN ZT',
258
        'ANTI^^', 'ANTI', 'ANTI',
259
        'ANVER^^', 'ANFA', 'ANFA',
260
        'ATIA$', 'ATIA', 'ATIA',
261
        'ATIA(NS)--', 'ATI', 'ATI',
262
        'ATI(AÄOÖUÜ)-', 'AZI', 'AZI',
263
        'AUAU--', '', '',
264
        'AUERE$', 'AUERE', None,
265
        'AUERE(NS)-$', 'AUERE', None,
266
        'AUERE(AIOUY)--', 'AUER', None,
267
        'AUER(AÄIOÖUÜY)-', 'AUER', None,
268
        'AUER<', 'AUA', 'AUA',
269
        'AUF^^', 'AUF', 'AUF',
270
        'AULT$', 'O', 'U',
271
        'AUR(BCDFGKLMNQSTVWZ)-', 'AUA', 'AUA',
272
        'AUR$', 'AUA', 'AUA',
273
        'AUSSE$', 'OS', 'UZ',
274
        'AUS(ST)-^', 'AUS', 'AUS',
275
        'AUS^^', 'AUS', 'AUS',
276
        'AUTOFAHR----', 'AUTO ', 'AUTU ',
277
        'AUTO^^', 'AUTO', 'AUTU',
278
        'AUX(IY)-', 'AUX', 'AUX',
279
        'AUX', 'O', 'U',
280
        'AU', 'AU', 'AU',
281
        'AVER--<', 'AW', None,
282
        'AVIER$', 'AWIE', 'AFIE',
283
        'AV(EÈÉÊI)-^', 'AW', None,
284
        'AV(AOU)-', 'AW', None,
285
        'AYRE$', 'EIRE', 'EIRE',
286
        'AYRE(NS)-$', 'EIRE', 'EIRE',
287
        'AYRE(AIOUY)--', 'EIR', 'EIR',
288
        'AYR(AÄIOÖUÜY)-', 'EIR', 'EIR',
289
        'AYR<', 'EIA', 'EIA',
290
        'AYER--<', 'EI', 'EI',
291
        'AY(AÄEIOÖUÜY)--', 'A', 'A',
292
        'AË', 'E', 'E',
293
        'A(IJY)<', 'EI', 'EI',
294
        'BABY^$', 'BEBI', 'BEBI',
295
        'BAB(IY)^', 'BEBI', 'BEBI',
296
        'BEAU^$', 'BO', None,
297
        'BEA(BCMNRU)-^', 'BEA', 'BEA',
298
        'BEAT(AEIMORU)-^', 'BEAT', 'BEAT',
299
        'BEE$', 'BI', 'BI',
300
        'BEIGE^$', 'BESH', 'BEZ',
301
        'BENOIT--', 'BENO', 'BENU',
302
        'BER(DT)-', 'BER', None,
303
        'BERN(DT)-', 'BERN', None,
304
        'BE(LMNRST)-^', 'BE', 'BE',
305
        'BETTE$', 'BET', 'BET',
306
        'BEVOR^$', 'BEFOR', None,
307
        'BIC$', 'BIZ', 'BIZ',
308
        'BOWL(EI)-', 'BOL', 'BUL',
309
        'BP(AÄEÈÉÊIÌÍÎOÖRUÜY)-', 'B', 'B',
310
        'BRINGEND-----^', 'BRI', 'BRI',
311
        'BRINGEND-----', ' BRI', ' BRI',
312
        'BROW(NS)-', 'BRAU', 'BRAU',
313
        'BUDGET7', 'BÜGE', 'BIKE',
314
        'BUFFET7', 'BÜFE', 'BIFE',
315
        'BYLLE$', 'BILE', 'BILE',
316
        'BYLL$', 'BIL', 'BIL',
317
        'BYPA--^', 'BEI', 'BEI',
318
        'BYTE<', 'BEIT', 'BEIT',
319
        'BY9^', 'BÜ', None,
320
        'B(SßZ)$', 'BS', None,
321
        'CACH(EI)-^', 'KESH', 'KEZ',
322
        'CAE--', 'Z', 'Z',
323
        'CA(IY)$', 'ZEI', 'ZEI',
324
        'CE(EIJUY)--', 'Z', 'Z',
325
        'CENT<', 'ZENT', 'ZENT',
326
        'CERST(EI)----^', 'KE', 'KE',
327
        'CER$', 'ZA', 'ZA',
328
        'CE3', 'ZE', 'ZE',
329
        'CH\'S$', 'X', 'X',
330
        'CH´S$', 'X', 'X',
331
        'CHAO(ST)-', 'KAO', 'KAU',
332
        'CHAMPIO-^', 'SHEMPI', 'ZENBI',
333
        'CHAR(AI)-^', 'KAR', 'KAR',
334
        'CHAU(CDFSVWXZ)-', 'SHO', 'ZU',
335
        'CHÄ(CF)-', 'SHE', 'ZE',
336
        'CHE(CF)-', 'SHE', 'ZE',
337
        'CHEM-^', 'KE', 'KE',  # or: 'CHE', 'KE'
338
        'CHEQUE<', 'SHEK', 'ZEK',
339
        'CHI(CFGPVW)-', 'SHI', 'ZI',
340
        'CH(AEUY)-<^', 'SH', 'Z',
341
        'CHK-', '', '',
342
        'CHO(CKPS)-^', 'SHO', 'ZU',
343
        'CHRIS-', 'KRI', None,
344
        'CHRO-', 'KR', None,
345
        'CH(LOR)-<^', 'K', 'K',
346
        'CHST-', 'X', 'X',
347
        'CH(SßXZ)3', 'X', 'X',
348
        'CHTNI-3', 'CHN', 'KN',
349
        'CH^', 'K', 'K',  # or: 'CH', 'K'
350
        'CH', 'CH', 'K',
351
        'CIC$', 'ZIZ', 'ZIZ',
352
        'CIENCEFICT----', 'EIENS ', 'EIENZ ',
353
        'CIENCE$', 'EIENS', 'EIENZ',
354
        'CIER$', 'ZIE', 'ZIE',
355
        'CYB-^', 'ZEI', 'ZEI',
356
        'CY9^', 'ZÜ', 'ZI',
357
        'C(IJY)-<3', 'Z', 'Z',
358
        'CLOWN-', 'KLAU', 'KLAU',
359
        'CCH', 'Z', 'Z',
360
        'CCE-', 'X', 'X',
361
        'C(CK)-', '', '',
362
        'CLAUDET---', 'KLO', 'KLU',
363
        'CLAUDINE^$', 'KLODIN', 'KLUTIN',
364
        'COACH', 'KOSH', 'KUZ',
365
        'COLE$', 'KOL', 'KUL',
366
        'COUCH', 'KAUSH', 'KAUZ',
367
        'COW', 'KAU', 'KAU',
368
        'CQUES$', 'K', 'K',
369
        'CQUE', 'K', 'K',
370
        'CRASH--9', 'KRE', 'KRE',
371
        'CREAT-^', 'KREA', 'KREA',
372
        'CST', 'XT', 'XT',
373
        'CS<^', 'Z', 'Z',
374
        'C(SßX)', 'X', 'X',
375
        'CT\'S$', 'X', 'X',
376
        'CT(SßXZ)', 'X', 'X',
377
        'CZ<', 'Z', 'Z',
378
        'C(ÈÉÊÌÍÎÝ)3', 'Z', 'Z',
379
        'C.^', 'C.', 'C.',
380
        'CÄ-', 'Z', 'Z',
381
        'CÜ$', 'ZÜ', 'ZI',
382
        'C\'S$', 'X', 'X',
383
        'C<', 'K', 'K',
384
        'DAHER^$', 'DAHER', None,
385
        'DARAUFFOLGE-----', 'DARAUF ', 'TARAUF ',
386
        'DAVO(NR)-^$', 'DAFO', 'TAFU',
387
        'DD(SZ)--<', '', '',
388
        'DD9', 'D', None,
389
        'DEPOT7', 'DEPO', 'TEBU',
390
        'DESIGN', 'DISEIN', 'TIZEIN',
391
        'DE(LMNRST)-3^', 'DE', 'TE',
392
        'DETTE$', 'DET', 'TET',
393
        'DH$', 'T', None,
394
        'DIC$', 'DIZ', 'TIZ',
395
        'DIDR-^', 'DIT', None,
396
        'DIEDR-^', 'DIT', None,
397
        'DJ(AEIOU)-^', 'I', 'I',
398
        'DMITR-^', 'DIMIT', 'TINIT',
399
        'DRY9^', 'DRÜ', None,
400
        'DT-', '', '',
401
        'DUIS-^', 'DÜ', 'TI',
402
        'DURCH^^', 'DURCH', 'TURK',
403
        'DVA$', 'TWA', None,
404
        'DY9^', 'DÜ', None,
405
        'DYS$', 'DIS', None,
406
        'DS(CH)--<', 'T', 'T',
407
        'DST', 'ZT', 'ZT',
408
        'DZS(CH)--', 'T', 'T',
409
        'D(SßZ)', 'Z', 'Z',
410
        'D(AÄEIOÖRUÜY)-', 'D', None,
411
        'D(ÀÁÂÃÅÈÉÊÌÍÎÙÚÛ)-', 'D', None,
412
        'D\'H^', 'D', 'T',
413
        'D´H^', 'D', 'T',
414
        'D`H^', 'D', 'T',
415
        'D\'S3$', 'Z', 'Z',
416
        'D´S3$', 'Z', 'Z',
417
        'D^', 'D', None,
418
        'D', 'T', 'T',
419
        'EAULT$', 'O', 'U',
420
        'EAUX$', 'O', 'U',
421
        'EAU', 'O', 'U',
422
        'EAV', 'IW', 'IF',
423
        'EAS3$', 'EAS', None,
424
        'EA(AÄEIOÖÜY)-3', 'EA', 'EA',
425
        'EA3$', 'EA', 'EA',
426
        'EA3', 'I', 'I',
427
        'EBENSO^$', 'EBNSO', 'EBNZU',
428
        'EBENSO^^', 'EBNSO ', 'EBNZU ',
429
        'EBEN^^', 'EBN', 'EBN',
430
        'EE9', 'E', 'E',
431
        'EGL-1', 'EK', None,
432
        'EHE(IUY)--1', 'EH', None,
433
        'EHUNG---1', 'E', None,
434
        'EH(AÄIOÖUÜY)-1', 'EH', None,
435
        'EIEI--', '', '',
436
        'EIERE^$', 'EIERE', None,
437
        'EIERE$', 'EIERE', None,
438
        'EIERE(NS)-$', 'EIERE', None,
439
        'EIERE(AIOUY)--', 'EIER', None,
440
        'EIER(AÄIOÖUÜY)-', 'EIER', None,
441
        'EIER<', 'EIA', None,
442
        'EIGL-1', 'EIK', None,
443
        'EIGH$', 'EI', 'EI',
444
        'EIH--', 'E', 'E',
445
        'EILLE$', 'EI', 'EI',
446
        'EIR(BCDFGKLMNQSTVWZ)-', 'EIA', 'EIA',
447
        'EIR$', 'EIA', 'EIA',
448
        'EITRAUBEN------', 'EIT ', 'EIT ',
449
        'EI', 'EI', 'EI',
450
        'EJ$', 'EI', 'EI',
451
        'ELIZ^', 'ELIS', None,
452
        'ELZ^', 'ELS', None,
453
        'EL-^', 'E', 'E',
454
        'ELANG----1', 'E', 'E',
455
        'EL(DKL)--1', 'E', 'E',
456
        'EL(MNT)--1$', 'E', 'E',
457
        'ELYNE$', 'ELINE', 'ELINE',
458
        'ELYN$', 'ELIN', 'ELIN',
459
        'EL(AÄEÈÉÊIÌÍÎOÖUÜY)-1', 'EL', 'EL',
460
        'EL-1', 'L', 'L',
461
        'EM-^', None, 'E',
462
        'EM(DFKMPQT)--1', None, 'E',
463
        'EM(AÄEÈÉÊIÌÍÎOÖUÜY)--1', None, 'E',
464
        'EM-1', None, 'N',
465
        'ENGAG-^', 'ANGA', 'ANKA',
466
        'EN-^', 'E', 'E',
467
        'ENTUEL', 'ENTUEL', None,
468
        'EN(CDGKQSTZ)--1', 'E', 'E',
469
        'EN(AÄEÈÉÊIÌÍÎNOÖUÜY)-1', 'EN', 'EN',
470
        'EN-1', '', '',
471
        'ERH(AÄEIOÖUÜ)-^', 'ERH', 'ER',
472
        'ER-^', 'E', 'E',
473
        'ERREGEND-----', ' ER', ' ER',
474
        'ERT1$', 'AT', None,
475
        'ER(DGLKMNRQTZß)-1', 'ER', None,
476
        'ER(AÄEÈÉÊIÌÍÎOÖUÜY)-1', 'ER', 'A',
477
        'ER1$', 'A', 'A',
478
        'ER<1', 'A', 'A',
479
        'ETAT7', 'ETA', 'ETA',
480
        'ETI(AÄOÖÜU)-', 'EZI', 'EZI',
481
        'EUERE$', 'EUERE', None,
482
        'EUERE(NS)-$', 'EUERE', None,
483
        'EUERE(AIOUY)--', 'EUER', None,
484
        'EUER(AÄIOÖUÜY)-', 'EUER', None,
485
        'EUER<', 'EUA', None,
486
        'EUEU--', '', '',
487
        'EUILLE$', 'Ö', 'Ö',
488
        'EUR$', 'ÖR', 'ÖR',
489
        'EUX', 'Ö', 'Ö',
490
        'EUSZ$', 'EUS', None,
491
        'EUTZ$', 'EUS', None,
492
        'EUYS$', 'EUS', 'EUZ',
493
        'EUZ$', 'EUS', None,
494
        'EU', 'EU', 'EU',
495
        'EVER--<1', 'EW', None,
496
        'EV(ÄOÖUÜ)-1', 'EW', None,
497
        'EYER<', 'EIA', 'EIA',
498
        'EY<', 'EI', 'EI',
499
        'FACETTE', 'FASET', 'FAZET',
500
        'FANS--^$', 'FE', 'FE',
501
        'FAN-^$', 'FE', 'FE',
502
        'FAULT-', 'FOL', 'FUL',
503
        'FEE(DL)-', 'FI', 'FI',
504
        'FEHLER', 'FELA', 'FELA',
505
        'FE(LMNRST)-3^', 'FE', 'FE',
506
        'FOERDERN---^', 'FÖRD', 'FÖRT',
507
        'FOERDERN---', ' FÖRD', ' FÖRT',
508
        'FOND7', 'FON', 'FUN',
509
        'FRAIN$', 'FRA', 'FRA',
510
        'FRISEU(RS)-', 'FRISÖ', 'FRIZÖ',
511
        'FY9^', 'FÜ', None,
512
        'FÖRDERN---^', 'FÖRD', 'FÖRT',
513
        'FÖRDERN---', ' FÖRD', ' FÖRT',
514
        'GAGS^$', 'GEX', 'KEX',
515
        'GAG^$', 'GEK', 'KEK',
516
        'GD', 'KT', 'KT',
517
        'GEGEN^^', 'GEGN', 'KEKN',
518
        'GEGENGEKOM-----', 'GEGN ', 'KEKN ',
519
        'GEGENGESET-----', 'GEGN ', 'KEKN ',
520
        'GEGENKOMME-----', 'GEGN ', 'KEKN ',
521
        'GEGENZUKOM---', 'GEGN ZU ', 'KEKN ZU ',
522
        'GENDETWAS-----$', 'GENT ', 'KENT ',
523
        'GENRE', 'IORE', 'IURE',
524
        'GE(LMNRST)-3^', 'GE', 'KE',
525
        'GER(DKT)-', 'GER', None,
526
        'GETTE$', 'GET', 'KET',
527
        'GGF.', 'GF.', None,
528
        'GG-', '', '',
529
        'GH', 'G', None,
530
        'GI(AOU)-^', 'I', 'I',
531
        'GION-3', 'KIO', 'KIU',
532
        'G(CK)-', '', '',
533
        'GJ(AEIOU)-^', 'I', 'I',
534
        'GMBH^$', 'GMBH', 'GMBH',
535
        'GNAC$', 'NIAK', 'NIAK',
536
        'GNON$', 'NION', 'NIUN',
537
        'GN$', 'N', 'N',
538
        'GONCAL-^', 'GONZA', 'KUNZA',
539
        'GRY9^', 'GRÜ', None,
540
        'G(SßXZ)-<', 'K', 'K',
541
        'GUCK-', 'KU', 'KU',
542
        'GUISEP-^', 'IUSE', 'IUZE',
543
        'GUI-^', 'G', 'K',
544
        'GUTAUSSEH------^', 'GUT ', 'KUT ',
545
        'GUTGEHEND------^', 'GUT ', 'KUT ',
546
        'GY9^', 'GÜ', None,
547
        'G(AÄEILOÖRUÜY)-', 'G', None,
548
        'G(ÀÁÂÃÅÈÉÊÌÍÎÙÚÛ)-', 'G', None,
549
        'G\'S$', 'X', 'X',
550
        'G´S$', 'X', 'X',
551
        'G^', 'G', None,
552
        'G', 'K', 'K',
553
        'HA(HIUY)--1', 'H', None,
554
        'HANDVOL---^', 'HANT ', 'ANT ',
555
        'HANNOVE-^', 'HANOF', None,
556
        'HAVEN7$', 'HAFN', None,
557
        'HEAD-', 'HE', 'E',
558
        'HELIEGEN------', 'E ', 'E ',
559
        'HESTEHEN------', 'E ', 'E ',
560
        'HE(LMNRST)-3^', 'HE', 'E',
561
        'HE(LMN)-1', 'E', 'E',
562
        'HEUR1$', 'ÖR', 'ÖR',
563
        'HE(HIUY)--1', 'H', None,
564
        'HIH(AÄEIOÖUÜY)-1', 'IH', None,
565
        'HLH(AÄEIOÖUÜY)-1', 'LH', None,
566
        'HMH(AÄEIOÖUÜY)-1', 'MH', None,
567
        'HNH(AÄEIOÖUÜY)-1', 'NH', None,
568
        'HOBBY9^', 'HOBI', None,
569
        'HOCHBEGAB-----^', 'HOCH ', 'UK ',
570
        'HOCHTALEN-----^', 'HOCH ', 'UK ',
571
        'HOCHZUFRI-----^', 'HOCH ', 'UK ',
572
        'HO(HIY)--1', 'H', None,
573
        'HRH(AÄEIOÖUÜY)-1', 'RH', None,
574
        'HUH(AÄEIOÖUÜY)-1', 'UH', None,
575
        'HUIS^^', 'HÜS', 'IZ',
576
        'HUIS$', 'ÜS', 'IZ',
577
        'HUI--1', 'H', None,
578
        'HYGIEN^', 'HÜKIEN', None,
579
        'HY9^', 'HÜ', None,
580
        'HY(BDGMNPST)-', 'Ü', None,
581
        'H.^', None, 'H.',
582
        'HÄU--1', 'H', None,
583
        'H^', 'H', '',
584
        'H', '', '',
585
        'ICHELL---', 'ISH', 'IZ',
586
        'ICHI$', 'ISHI', 'IZI',
587
        'IEC$', 'IZ', 'IZ',
588
        'IEDENSTELLE------', 'IDN ', 'ITN ',
589
        'IEI-3', '', '',
590
        'IELL3', 'IEL', 'IEL',
591
        'IENNE$', 'IN', 'IN',
592
        'IERRE$', 'IER', 'IER',
593
        'IERZULAN---', 'IR ZU ', 'IR ZU ',
594
        'IETTE$', 'IT', 'IT',
595
        'IEU', 'IÖ', 'IÖ',
596
        'IE<4', 'I', 'I',
597
        'IGL-1', 'IK', None,
598
        'IGHT3$', 'EIT', 'EIT',
599
        'IGNI(EO)-', 'INI', 'INI',
600
        'IGN(AEOU)-$', 'INI', 'INI',
601
        'IHER(DGLKRT)--1', 'IHE', None,
602
        'IHE(IUY)--', 'IH', None,
603
        'IH(AIOÖUÜY)-', 'IH', None,
604
        'IJ(AOU)-', 'I', 'I',
605
        'IJ$', 'I', 'I',
606
        'IJ<', 'EI', 'EI',
607
        'IKOLE$', 'IKOL', 'IKUL',
608
        'ILLAN(STZ)--4', 'ILIA', 'ILIA',
609
        'ILLAR(DT)--4', 'ILIA', 'ILIA',
610
        'IMSTAN----^', 'IM ', 'IN ',
611
        'INDELERREGE------', 'INDL ', 'INTL ',
612
        'INFRAGE-----^$', 'IN ', 'IN ',
613
        'INTERN(AOU)-^', 'INTAN', 'INTAN',
614
        'INVER-', 'INWE', 'INFE',
615
        'ITI(AÄIOÖUÜ)-', 'IZI', 'IZI',
616
        'IUSZ$', 'IUS', None,
617
        'IUTZ$', 'IUS', None,
618
        'IUZ$', 'IUS', None,
619
        'IVER--<', 'IW', None,
620
        'IVIER$', 'IWIE', 'IFIE',
621
        'IV(ÄOÖUÜ)-', 'IW', None,
622
        'IV<3', 'IW', None,
623
        'IY2', 'I', None,
624
        'I(ÈÉÊ)<4', 'I', 'I',
625
        'JAVIE---<^', 'ZA', 'ZA',
626
        'JEANS^$', 'JINS', 'INZ',
627
        'JEANNE^$', 'IAN', 'IAN',
628
        'JEAN-^', 'IA', 'IA',
629
        'JER-^', 'IE', 'IE',
630
        'JE(LMNST)-', 'IE', 'IE',
631
        'JI^', 'JI', None,
632
        'JOR(GK)^$', 'IÖRK', 'IÖRK',
633
        'J', 'I', 'I',
634
        'KC(ÄEIJ)-', 'X', 'X',
635
        'KD', 'KT', None,
636
        'KE(LMNRST)-3^', 'KE', 'KE',
637
        'KG(AÄEILOÖRUÜY)-', 'K', None,
638
        'KH<^', 'K', 'K',
639
        'KIC$', 'KIZ', 'KIZ',
640
        'KLE(LMNRST)-3^', 'KLE', 'KLE',
641
        'KOTELE-^', 'KOTL', 'KUTL',
642
        'KREAT-^', 'KREA', 'KREA',
643
        'KRÜS(TZ)--^', 'KRI', None,
644
        'KRYS(TZ)--^', 'KRI', None,
645
        'KRY9^', 'KRÜ', None,
646
        'KSCH---', 'K', 'K',
647
        'KSH--', 'K', 'K',
648
        'K(SßXZ)7', 'X', 'X',  # implies 'KST' -> 'XT'
649
        'KT\'S$', 'X', 'X',
650
        'KTI(AIOU)-3', 'XI', 'XI',
651
        'KT(SßXZ)', 'X', 'X',
652
        'KY9^', 'KÜ', None,
653
        'K\'S$', 'X', 'X',
654
        'K´S$', 'X', 'X',
655
        'LANGES$', ' LANGES', ' LANKEZ',
656
        'LANGE$', ' LANGE', ' LANKE',
657
        'LANG$', ' LANK', ' LANK',
658
        'LARVE-', 'LARF', 'LARF',
659
        'LD(SßZ)$', 'LS', 'LZ',
660
        'LD\'S$', 'LS', 'LZ',
661
        'LD´S$', 'LS', 'LZ',
662
        'LEAND-^', 'LEAN', 'LEAN',
663
        'LEERSTEHE-----^', 'LER ', 'LER ',
664
        'LEICHBLEIB-----', 'LEICH ', 'LEIK ',
665
        'LEICHLAUTE-----', 'LEICH ', 'LEIK ',
666
        'LEIDERREGE------', 'LEIT ', 'LEIT ',
667
        'LEIDGEPR----^', 'LEIT ', 'LEIT ',
668
        'LEINSTEHE-----', 'LEIN ', 'LEIN ',
669
        'LEL-', 'LE', 'LE',
670
        'LE(MNRST)-3^', 'LE', 'LE',
671
        'LETTE$', 'LET', 'LET',
672
        'LFGNAG-', 'LFGAN', 'LFKAN',
673
        'LICHERWEIS----', 'LICHA ', 'LIKA ',
674
        'LIC$', 'LIZ', 'LIZ',
675
        'LIVE^$', 'LEIF', 'LEIF',
676
        'LT(SßZ)$', 'LS', 'LZ',
677
        'LT\'S$', 'LS', 'LZ',
678
        'LT´S$', 'LS', 'LZ',
679
        'LUI(GS)--', 'LU', 'LU',
680
        'LV(AIO)-', 'LW', None,
681
        'LY9^', 'LÜ', None,
682
        'LSTS$', 'LS', 'LZ',
683
        'LZ(BDFGKLMNPQRSTVWX)-', 'LS', None,
684
        'L(SßZ)$', 'LS', None,
685
        'MAIR-<', 'MEI', 'NEI',
686
        'MANAG-', 'MENE', 'NENE',
687
        'MANUEL', 'MANUEL', None,
688
        'MASSEU(RS)-', 'MASÖ', 'NAZÖ',
689
        'MATCH', 'MESH', 'NEZ',
690
        'MAURICE', 'MORIS', 'NURIZ',
691
        'MBH^$', 'MBH', 'MBH',
692
        'MB(ßZ)$', 'MS', None,
693
        'MB(SßTZ)-', 'M', 'N',
694
        'MCG9^', 'MAK', 'NAK',
695
        'MC9^', 'MAK', 'NAK',
696
        'MEMOIR-^', 'MEMOA', 'NENUA',
697
        'MERHAVEN$', 'MAHAFN', None,
698
        'ME(LMNRST)-3^', 'ME', 'NE',
699
        'MEN(STZ)--3', 'ME', None,
700
        'MEN$', 'MEN', None,
701
        'MIGUEL-', 'MIGE', 'NIKE',
702
        'MIKE^$', 'MEIK', 'NEIK',
703
        'MITHILFE----^$', 'MIT H', 'NIT ',
704
        'MN$', 'M', None,
705
        'MN', 'N', 'N',
706
        'MPJUTE-', 'MPUT', 'NBUT',
707
        'MP(ßZ)$', 'MS', None,
708
        'MP(SßTZ)-', 'M', 'N',
709
        'MP(BDJLMNPQVW)-', 'MB', 'NB',
710
        'MY9^', 'MÜ', None,
711
        'M(ßZ)$', 'MS', None,
712
        'M´G7^', 'MAK', 'NAK',
713
        'M\'G7^', 'MAK', 'NAK',
714
        'M´^', 'MAK', 'NAK',
715
        'M\'^', 'MAK', 'NAK',
716
        'M', None, 'N',
717
        'NACH^^', 'NACH', 'NAK',
718
        'NADINE', 'NADIN', 'NATIN',
719
        'NAIV--', 'NA', 'NA',
720
        'NAISE$', 'NESE', 'NEZE',
721
        'NAUGENOMM------', 'NAU ', 'NAU ',
722
        'NAUSOGUT$', 'NAUSO GUT', 'NAUZU KUT',
723
        'NCH$', 'NSH', 'NZ',
724
        'NCOISE$', 'SOA', 'ZUA',
725
        'NCOIS$', 'SOA', 'ZUA',
726
        'NDAR$', 'NDA', 'NTA',
727
        'NDERINGEN------', 'NDE ', 'NTE ',
728
        'NDRO(CDKTZ)-', 'NTRO', None,
729
        'ND(BFGJLMNPQVW)-', 'NT', None,
730
        'ND(SßZ)$', 'NS', 'NZ',
731
        'ND\'S$', 'NS', 'NZ',
732
        'ND´S$', 'NS', 'NZ',
733
        'NEBEN^^', 'NEBN', 'NEBN',
734
        'NENGELERN------', 'NEN ', 'NEN ',
735
        'NENLERN(ET)---', 'NEN LE', 'NEN LE',
736
        'NENZULERNE---', 'NEN ZU LE', 'NEN ZU LE',
737
        'NE(LMNRST)-3^', 'NE', 'NE',
738
        'NEN-3', 'NE', 'NE',
739
        'NETTE$', 'NET', 'NET',
740
        'NGU^^', 'NU', 'NU',
741
        'NG(BDFJLMNPQRTVW)-', 'NK', 'NK',
742
        'NH(AUO)-$', 'NI', 'NI',
743
        'NICHTSAHNEN-----', 'NIX ', 'NIX ',
744
        'NICHTSSAGE----', 'NIX ', 'NIX ',
745
        'NICHTS^^', 'NIX', 'NIX',
746
        'NICHT^^', 'NICHT', 'NIKT',
747
        'NINE$', 'NIN', 'NIN',
748
        'NON^^', 'NON', 'NUN',
749
        'NOTLEIDE-----^', 'NOT ', 'NUT ',
750
        'NOT^^', 'NOT', 'NUT',
751
        'NTI(AIOU)-3', 'NZI', 'NZI',
752
        'NTIEL--3', 'NZI', 'NZI',
753
        'NT(SßZ)$', 'NS', 'NZ',
754
        'NT\'S$', 'NS', 'NZ',
755
        'NT´S$', 'NS', 'NZ',
756
        'NYLON', 'NEILON', 'NEILUN',
757
        'NY9^', 'NÜ', None,
758
        'NSTZUNEH---', 'NST ZU ', 'NZT ZU ',
759
        'NSZ-', 'NS', None,
760
        'NSTS$', 'NS', 'NZ',
761
        'NZ(BDFGKLMNPQRSTVWX)-', 'NS', None,
762
        'N(SßZ)$', 'NS', None,
763
        'OBERE-', 'OBER', None,
764
        'OBER^^', 'OBA', 'UBA',
765
        'OEU2', 'Ö', 'Ö',
766
        'OE<2', 'Ö', 'Ö',
767
        'OGL-', 'OK', None,
768
        'OGNIE-', 'ONI', 'UNI',
769
        'OGN(AEOU)-$', 'ONI', 'UNI',
770
        'OH(AIOÖUÜY)-', 'OH', None,
771
        'OIE$', 'Ö', 'Ö',
772
        'OIRE$', 'OA', 'UA',
773
        'OIR$', 'OA', 'UA',
774
        'OIX', 'OA', 'UA',
775
        'OI<3', 'EU', 'EU',
776
        'OKAY^$', 'OKE', 'UKE',
777
        'OLYN$', 'OLIN', 'ULIN',
778
        'OO(DLMZ)-', 'U', None,
779
        'OO$', 'U', None,
780
        'OO-', '', '',
781
        'ORGINAL-----', 'ORI', 'URI',
782
        'OTI(AÄOÖUÜ)-', 'OZI', 'UZI',
783
        'OUI^', 'WI', 'FI',
784
        'OUILLE$', 'ULIE', 'ULIE',
785
        'OU(DT)-^', 'AU', 'AU',
786
        'OUSE$', 'AUS', 'AUZ',
787
        'OUT-', 'AU', 'AU',
788
        'OU', 'U', 'U',
789
        'O(FV)$', 'AU', 'AU',  # due to 'OW$' -> 'AU'
790
        'OVER--<', 'OW', None,
791
        'OV(AOU)-', 'OW', None,
792
        'OW$', 'AU', 'AU',
793
        'OWS$', 'OS', 'UZ',
794
        'OJ(AÄEIOÖUÜ)--', 'O', 'U',
795
        'OYER', 'OIA', None,
796
        'OY(AÄEIOÖUÜ)--', 'O', 'U',
797
        'O(JY)<', 'EU', 'EU',
798
        'OZ$', 'OS', None,
799
        'O´^', 'O', 'U',
800
        'O\'^', 'O', 'U',
801
        'O', None, 'U',
802
        'PATIEN--^', 'PAZI', 'PAZI',
803
        'PENSIO-^', 'PANSI', 'PANZI',
804
        'PE(LMNRST)-3^', 'PE', 'PE',
805
        'PFER-^', 'FE', 'FE',
806
        'P(FH)<', 'F', 'F',
807
        'PIC^$', 'PIK', 'PIK',
808
        'PIC$', 'PIZ', 'PIZ',
809
        'PIPELINE', 'PEIBLEIN', 'PEIBLEIN',
810
        'POLYP-', 'POLÜ', None,
811
        'POLY^^', 'POLI', 'PULI',
812
        'PORTRAIT7', 'PORTRE', 'PURTRE',
813
        'POWER7', 'PAUA', 'PAUA',
814
        'PP(FH)--<', 'B', 'B',
815
        'PP-', '', '',
816
        'PRODUZ-^', 'PRODU', 'BRUTU',
817
        'PRODUZI--', ' PRODU', ' BRUTU',
818
        'PRIX^$', 'PRI', 'PRI',
819
        'PS-^^', 'P', None,
820
        'P(SßZ)^', None, 'Z',
821
        'P(SßZ)$', 'BS', None,
822
        'PT-^', '', '',
823
        'PTI(AÄOÖUÜ)-3', 'BZI', 'BZI',
824
        'PY9^', 'PÜ', None,
825
        'P(AÄEIOÖRUÜY)-', 'P', 'P',
826
        'P(ÀÁÂÃÅÈÉÊÌÍÎÙÚÛ)-', 'P', None,
827
        'P.^', None, 'P.',
828
        'P^', 'P', None,
829
        'P', 'B', 'B',
830
        'QI-', 'Z', 'Z',
831
        'QUARANT--', 'KARA', 'KARA',
832
        'QUE(LMNRST)-3', 'KWE', 'KFE',
833
        'QUE$', 'K', 'K',
834
        'QUI(NS)$', 'KI', 'KI',
835
        'QUIZ7', 'KWIS', None,
836
        'Q(UV)7', 'KW', 'KF',
837
        'Q<', 'K', 'K',
838
        'RADFAHR----', 'RAT ', 'RAT ',
839
        'RAEFTEZEHRE-----', 'REFTE ', 'REFTE ',
840
        'RCH', 'RCH', 'RK',
841
        'REA(DU)---3^', 'R', None,
842
        'REBSERZEUG------', 'REBS ', 'REBZ ',
843
        'RECHERCH^', 'RESHASH', 'REZAZ',
844
        'RECYCL--', 'RIZEI', 'RIZEI',
845
        'RE(ALST)-3^', 'RE', None,
846
        'REE$', 'RI', 'RI',
847
        'RER$', 'RA', 'RA',
848
        'RE(MNR)-4', 'RE', 'RE',
849
        'RETTE$', 'RET', 'RET',
850
        'REUZ$', 'REUZ', None,
851
        'REW$', 'RU', 'RU',
852
        'RH<^', 'R', 'R',
853
        'RJA(MN)--', 'RI', 'RI',
854
        'ROWD-^', 'RAU', 'RAU',
855
        'RTEMONNAIE-', 'RTMON', 'RTNUN',
856
        'RTI(AÄOÖUÜ)-3', 'RZI', 'RZI',
857
        'RTIEL--3', 'RZI', 'RZI',
858
        'RV(AEOU)-3', 'RW', None,
859
        'RY(KN)-$', 'RI', 'RI',
860
        'RY9^', 'RÜ', None,
861
        'RÄFTEZEHRE-----', 'REFTE ', 'REFTE ',
862
        'SAISO-^', 'SES', 'ZEZ',
863
        'SAFE^$', 'SEIF', 'ZEIF',
864
        'SAUCE-^', 'SOS', 'ZUZ',
865
        'SCHLAGGEBEN-----<', 'SHLAK ', 'ZLAK ',
866
        'SCHSCH---7', '', '',
867
        'SCHTSCH', 'SH', 'Z',
868
        'SC(HZ)<', 'SH', 'Z',
869
        'SC', 'SK', 'ZK',
870
        'SELBSTST--7^^', 'SELB', 'ZELB',
871
        'SELBST7^^', 'SELBST', 'ZELBZT',
872
        'SERVICE7^', 'SÖRWIS', 'ZÖRFIZ',
873
        'SERVI-^', 'SERW', None,
874
        'SE(LMNRST)-3^', 'SE', 'ZE',
875
        'SETTE$', 'SET', 'ZET',
876
        'SHP-^', 'S', 'Z',
877
        'SHST', 'SHT', 'ZT',
878
        'SHTSH', 'SH', 'Z',
879
        'SHT', 'ST', 'Z',
880
        'SHY9^', 'SHÜ', None,
881
        'SH^^', 'SH', None,
882
        'SH3', 'SH', 'Z',
883
        'SICHERGEGAN-----^', 'SICHA ', 'ZIKA ',
884
        'SICHERGEHE----^', 'SICHA ', 'ZIKA ',
885
        'SICHERGESTEL------^', 'SICHA ', 'ZIKA ',
886
        'SICHERSTELL-----^', 'SICHA ', 'ZIKA ',
887
        'SICHERZU(GS)--^', 'SICHA ZU ', 'ZIKA ZU ',
888
        'SIEGLI-^', 'SIKL', 'ZIKL',
889
        'SIGLI-^', 'SIKL', 'ZIKL',
890
        'SIGHT', 'SEIT', 'ZEIT',
891
        'SIGN', 'SEIN', 'ZEIN',
892
        'SKI(NPZ)-', 'SKI', 'ZKI',
893
        'SKI<^', 'SHI', 'ZI',
894
        'SODASS^$', 'SO DAS', 'ZU TAZ',
895
        'SODAß^$', 'SO DAS', 'ZU TAZ',
896
        'SOGENAN--^', 'SO GEN', 'ZU KEN',
897
        'SOUND-', 'SAUN', 'ZAUN',
898
        'STAATS^^', 'STAZ', 'ZTAZ',
899
        'STADT^^', 'STAT', 'ZTAT',
900
        'STANDE$', ' STANDE', ' ZTANTE',
901
        'START^^', 'START', 'ZTART',
902
        'STAURANT7', 'STORAN', 'ZTURAN',
903
        'STEAK-', 'STE', 'ZTE',
904
        'STEPHEN-^$', 'STEW', None,
905
        'STERN', 'STERN', None,
906
        'STRAF^^', 'STRAF', 'ZTRAF',
907
        'ST\'S$', 'Z', 'Z',
908
        'ST´S$', 'Z', 'Z',
909
        'STST--', '', '',
910
        'STS(ACEÈÉÊHIÌÍÎOUÄÜÖ)--', 'ST', 'ZT',
911
        'ST(SZ)', 'Z', 'Z',
912
        'SPAREN---^', 'SPA', 'ZPA',
913
        'SPAREND----', ' SPA', ' ZPA',
914
        'S(PTW)-^^', 'S', None,
915
        'SP', 'SP', None,
916
        'STYN(AE)-$', 'STIN', 'ZTIN',
917
        'ST', 'ST', 'ZT',
918
        'SUITE<', 'SIUT', 'ZIUT',
919
        'SUKE--$', 'S', 'Z',
920
        'SURF(EI)-', 'SÖRF', 'ZÖRF',
921
        'SV(AEÈÉÊIÌÍÎOU)-<^', 'SW', None,
922
        'SYB(IY)--^', 'SIB', None,
923
        'SYL(KVW)--^', 'SI', None,
924
        'SY9^', 'SÜ', None,
925
        'SZE(NPT)-^', 'ZE', 'ZE',
926
        'SZI(ELN)-^', 'ZI', 'ZI',
927
        'SZCZ<', 'SH', 'Z',
928
        'SZT<', 'ST', 'ZT',
929
        'SZ<3', 'SH', 'Z',
930
        'SÜL(KVW)--^', 'SI', None,
931
        'S', None, 'Z',
932
        'TCH', 'SH', 'Z',
933
        'TD(AÄEIOÖRUÜY)-', 'T', None,
934
        'TD(ÀÁÂÃÅÈÉÊËÌÍÎÏÒÓÔÕØÙÚÛÝŸ)-', 'T', None,
935
        'TEAT-^', 'TEA', 'TEA',
936
        'TERRAI7^', 'TERA', 'TERA',
937
        'TE(LMNRST)-3^', 'TE', 'TE',
938
        'TH<', 'T', 'T',
939
        'TICHT-', 'TIK', 'TIK',
940
        'TICH$', 'TIK', 'TIK',
941
        'TIC$', 'TIZ', 'TIZ',
942
        'TIGGESTELL-------', 'TIK ', 'TIK ',
943
        'TIGSTELL-----', 'TIK ', 'TIK ',
944
        'TOAS-^', 'TO', 'TU',
945
        'TOILET-', 'TOLE', 'TULE',
946
        'TOIN-', 'TOA', 'TUA',
947
        'TRAECHTI-^', 'TRECHT', 'TREKT',
948
        'TRAECHTIG--', ' TRECHT', ' TREKT',
949
        'TRAINI-', 'TREN', 'TREN',
950
        'TRÄCHTI-^', 'TRECHT', 'TREKT',
951
        'TRÄCHTIG--', ' TRECHT', ' TREKT',
952
        'TSCH', 'SH', 'Z',
953
        'TSH', 'SH', 'Z',
954
        'TST', 'ZT', 'ZT',
955
        'T(Sß)', 'Z', 'Z',
956
        'TT(SZ)--<', '', '',
957
        'TT9', 'T', 'T',
958
        'TV^$', 'TV', 'TV',
959
        'TX(AEIOU)-3', 'SH', 'Z',
960
        'TY9^', 'TÜ', None,
961
        'TZ-', '', '',
962
        'T\'S3$', 'Z', 'Z',
963
        'T´S3$', 'Z', 'Z',
964
        'UEBEL(GNRW)-^^', 'ÜBL ', 'IBL ',
965
        'UEBER^^', 'ÜBA', 'IBA',
966
        'UE2', 'Ü', 'I',
967
        'UGL-', 'UK', None,
968
        'UH(AOÖUÜY)-', 'UH', None,
969
        'UIE$', 'Ü', 'I',
970
        'UM^^', 'UM', 'UN',
971
        'UNTERE--3', 'UNTE', 'UNTE',
972
        'UNTER^^', 'UNTA', 'UNTA',
973
        'UNVER^^', 'UNFA', 'UNFA',
974
        'UN^^', 'UN', 'UN',
975
        'UTI(AÄOÖUÜ)-', 'UZI', 'UZI',
976
        'UVE-4', 'UW', None,
977
        'UY2', 'UI', None,
978
        'UZZ', 'AS', 'AZ',
979
        'VACL-^', 'WAZ', 'FAZ',
980
        'VAC$', 'WAZ', 'FAZ',
981
        'VAN DEN ^', 'FANDN', 'FANTN',
982
        'VANES-^', 'WANE', None,
983
        'VATRO-', 'WATR', None,
984
        'VA(DHJNT)--^', 'F', None,
985
        'VEDD-^', 'FE', 'FE',
986
        'VE(BEHIU)--^', 'F', None,
987
        'VEL(BDLMNT)-^', 'FEL', None,
988
        'VENTZ-^', 'FEN', None,
989
        'VEN(NRSZ)-^', 'FEN', None,
990
        'VER(AB)-^$', 'WER', None,
991
        'VERBAL^$', 'WERBAL', None,
992
        'VERBAL(EINS)-^', 'WERBAL', None,
993
        'VERTEBR--', 'WERTE', None,
994
        'VEREIN-----', 'F', None,
995
        'VEREN(AEIOU)-^', 'WEREN', None,
996
        'VERIFI', 'WERIFI', None,
997
        'VERON(AEIOU)-^', 'WERON', None,
998
        'VERSEN^', 'FERSN', 'FAZN',
999
        'VERSIERT--^', 'WERSI', None,
1000
        'VERSIO--^', 'WERS', None,
1001
        'VERSUS', 'WERSUS', None,
1002
        'VERTI(GK)-', 'WERTI', None,
1003
        'VER^^', 'FER', 'FA',
1004
        'VERSPRECHE-------', ' FER', ' FA',
1005
        'VER$', 'WA', None,
1006
        'VER', 'FA', 'FA',
1007
        'VET(HT)-^', 'FET', 'FET',
1008
        'VETTE$', 'WET', 'FET',
1009
        'VE^', 'WE', None,
1010
        'VIC$', 'WIZ', 'FIZ',
1011
        'VIELSAGE----', 'FIL ', 'FIL ',
1012
        'VIEL', 'FIL', 'FIL',
1013
        'VIEW', 'WIU', 'FIU',
1014
        'VILL(AE)-', 'WIL', None,
1015
        'VIS(ACEIKUVWZ)-<^', 'WIS', None,
1016
        'VI(ELS)--^', 'F', None,
1017
        'VILLON--', 'WILI', 'FILI',
1018
        'VIZE^^', 'FIZE', 'FIZE',
1019
        'VLIE--^', 'FL', None,
1020
        'VL(AEIOU)--', 'W', None,
1021
        'VOKA-^', 'WOK', None,
1022
        'VOL(ATUVW)--^', 'WO', None,
1023
        'VOR^^', 'FOR', 'FUR',
1024
        'VR(AEIOU)--', 'W', None,
1025
        'VV9', 'W', None,
1026
        'VY9^', 'WÜ', 'FI',
1027
        'V(ÜY)-', 'W', None,
1028
        'V(ÀÁÂÃÅÈÉÊÌÍÎÙÚÛ)-', 'W', None,
1029
        'V(AEIJLRU)-<', 'W', None,
1030
        'V.^', 'V.', None,
1031
        'V<', 'F', 'F',
1032
        'WEITERENTWI-----^', 'WEITA ', 'FEITA ',
1033
        'WEITREICH-----^', 'WEIT ', 'FEIT ',
1034
        'WEITVER^', 'WEIT FER', 'FEIT FA',
1035
        'WE(LMNRST)-3^', 'WE', 'FE',
1036
        'WER(DST)-', 'WER', None,
1037
        'WIC$', 'WIZ', 'FIZ',
1038
        'WIEDERU--', 'WIDE', 'FITE',
1039
        'WIEDER^$', 'WIDA', 'FITA',
1040
        'WIEDER^^', 'WIDA ', 'FITA ',
1041
        'WIEVIEL', 'WI FIL', 'FI FIL',
1042
        'WISUEL', 'WISUEL', None,
1043
        'WR-^', 'W', None,
1044
        'WY9^', 'WÜ', 'FI',
1045
        'W(BDFGJKLMNPQRSTZ)-', 'F', None,
1046
        'W$', 'F', None,
1047
        'W', None, 'F',
1048
        'X<^', 'Z', 'Z',
1049
        'XHAVEN$', 'XAFN', None,
1050
        'X(CSZ)', 'X', 'X',
1051
        'XTS(CH)--', 'XT', 'XT',
1052
        'XT(SZ)', 'Z', 'Z',
1053
        'YE(LMNRST)-3^', 'IE', 'IE',
1054
        'YE-3', 'I', 'I',
1055
        'YOR(GK)^$', 'IÖRK', 'IÖRK',
1056
        'Y(AOU)-<7', 'I', 'I',
1057
        'Y(BKLMNPRSTX)-1', 'Ü', None,
1058
        'YVES^$', 'IF', 'IF',
1059
        'YVONNE^$', 'IWON', 'IFUN',
1060
        'Y.^', 'Y.', None,
1061
        'Y', 'I', 'I',
1062
        'ZC(AOU)-', 'SK', 'ZK',
1063
        'ZE(LMNRST)-3^', 'ZE', 'ZE',
1064
        'ZIEJ$', 'ZI', 'ZI',
1065
        'ZIGERJA(HR)-3', 'ZIGA IA', 'ZIKA IA',
1066
        'ZL(AEIOU)-', 'SL', None,
1067
        'ZS(CHT)--', '', '',
1068
        'ZS', 'SH', 'Z',
1069
        'ZUERST', 'ZUERST', 'ZUERST',
1070
        'ZUGRUNDE^$', 'ZU GRUNDE', 'ZU KRUNTE',
1071
        'ZUGRUNDE', 'ZU GRUNDE ', 'ZU KRUNTE ',
1072
        'ZUGUNSTEN', 'ZU GUNSTN', 'ZU KUNZTN',
1073
        'ZUHAUSE-', 'ZU HAUS', 'ZU AUZ',
1074
        'ZULASTEN^$', 'ZU LASTN', 'ZU LAZTN',
1075
        'ZURUECK^^', 'ZURÜK', 'ZURIK',
1076
        'ZURZEIT', 'ZUR ZEIT', 'ZUR ZEIT',
1077
        'ZURÜCK^^', 'ZURÜK', 'ZURIK',
1078
        'ZUSTANDE', 'ZU STANDE', 'ZU ZTANTE',
1079
        'ZUTAGE', 'ZU TAGE', 'ZU TAKE',
1080
        'ZUVER^^', 'ZUFA', 'ZUFA',
1081
        'ZUVIEL', 'ZU FIL', 'ZU FIL',
1082
        'ZUWENIG', 'ZU WENIK', 'ZU FENIK',
1083
        'ZY9^', 'ZÜ', None,
1084
        'ZYK3$', 'ZIK', None,
1085
        'Z(VW)7^', 'SW', None,
1086
        None, None, None
1087
        # fmt: on
1088
    )
1089
1090 1
    # Lookup tables populated by _initialize_phonet().  Counter is used so
    # that reading a key that was never stored yields 0 instead of raising.
    #
    # phonet_hash: first character of a rule -> index of a rule starting
    #     with that character (-1 until one is recorded).
    # alpha_pos: character -> 1 for accented/special letters, 2..27 for
    #     'A'..'Z'; any other character reads as 0.
    phonet_hash, alpha_pos = Counter(), Counter()

    # Both keyed by (letter index 0..25, alpha_pos of the following char):
    # phonet_hash_1 records the first rule index stored for the pair and
    # phonet_hash_2 the most recent one; column 0 is the catch-all slot.
    phonet_hash_1, phonet_hash_2 = Counter(), Counter()

    # Code point -> upper-case character, suitable for unicode.translate().
    # Note that 'ß' is deliberately mapped to itself.
    _phonet_upper_translation = {
        ord(lower): upper
        for lower, upper in zip(
            'abcdefghijklmnopqrstuvwxyzàáâãåäæ'
            'çðèéêëìíîïñòóôõöøœšßþùúûüýÿ',
            'ABCDEFGHIJKLMNOPQRSTUVWXYZÀÁÂÃÅÄÆ'
            'ÇÐÈÉÊËÌÍÎÏÑÒÓÔÕÖØŒŠßÞÙÚÛÜÝŸ',
        )
    }
1107
1108 1
    def _initialize_phonet(lang):
        """Fill the rule lookup tables for the selected rule set.

        The rule set is a flat sequence of triples (search pattern, first
        replacement, second replacement).  ``lang == 'none'`` selects the
        language-neutral rules; any other value selects the German rules.

        Nothing is returned: the enclosing ``phonet_hash``, ``alpha_pos``,
        ``phonet_hash_1`` and ``phonet_hash_2`` tables are updated in place.
        """
        rules = (
            _phonet_rules_no_lang if lang == 'none' else _phonet_rules_german
        )

        phonet_hash[''] = -1

        # Accented and other non-ASCII letters all share position 1.
        for special in 'ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖØÙÚÛÜÝÞßŒŠŸ':
            alpha_pos[special] = 1
            phonet_hash[special] = -1

        # Plain letters 'A'..'Z' occupy positions 2..27.
        for position, letter in enumerate('ABCDEFGHIJKLMNOPQRSTUVWXYZ', 2):
            alpha_pos[letter] = position
            phonet_hash[letter] = -1

        # Mark every (letter, follower position) cell as "no rule yet".
        for cell in ((row, col) for row in range(26) for col in range(28)):
            phonet_hash_1[cell] = -1
            phonet_hash_2[cell] = -1

        # Visit only the search patterns, i.e. every third entry.
        for index in range(0, len(rules), 3):
            pattern = rules[index]
            if not pattern:
                continue

            lead = pattern[0]

            # First-level hash: first rule for this leading character that
            # has a non-empty replacement in at least one of the two columns.
            if phonet_hash[lead] < 0:
                if rules[index + 1] or rules[index + 2]:
                    phonet_hash[lead] = index

            # Second-level hashes exist only for the plain letters 'A'..'Z'.
            lead_pos = alpha_pos[lead]
            if lead_pos < 2:
                continue
            row = lead_pos - 2

            # Characters that may follow the leading letter: the members of
            # a '(...)' group, the single next pattern character, or a blank
            # when the pattern is just one letter.
            tail = pattern[1:]
            if not tail:
                followers = ' '
            elif tail[0] == '(':
                followers = tail[1:]
            else:
                followers = tail[0]

            for follower in followers:
                if follower == ')':
                    break

                col = alpha_pos[follower]
                use_catch_all = col <= 0

                if not use_catch_all:
                    # Record the rule range for this specific follower.
                    if phonet_hash_1[row, col] < 0:
                        phonet_hash_1[row, col] = index
                        phonet_hash_2[row, col] = index

                    if phonet_hash_2[row, col] >= index - 30:
                        phonet_hash_2[row, col] = index
                    else:
                        # The last entry for this pair lies more than 30
                        # slots back; file the rule under the catch-all
                        # column instead of stretching the range.
                        use_catch_all = True

                if use_catch_all:
                    # Record the rule range for "any follower".
                    if phonet_hash_1[row, 0] < 0:
                        phonet_hash_1[row, 0] = index

                    phonet_hash_2[row, 0] = index
1216
1217 1
    def _phonet(term, mode, lang):
0 ignored issues
show
Comprehensibility introduced by
This function exceeds the maximum number of variables (29/15).
Loading history...
1218
        """Return the phonet coded form of a term."""
1219 1
        if lang == 'none':
1220 1
            _phonet_rules = _phonet_rules_no_lang
1221
        else:
1222 1
            _phonet_rules = _phonet_rules_german
1223
1224 1
        char0 = ''
1225 1
        dest = term
1226
1227 1
        if not term:
1228 1
            return ''
1229
1230 1
        term_length = len(term)
1231
1232
        # convert input string to upper-case
1233 1
        src = term.translate(_phonet_upper_translation)
1234
1235
        # check "src"
1236 1
        i = 0
1237 1
        j = 0
1238 1
        zeta = 0
1239
1240 1
        while i < len(src):
0 ignored issues
show
unused-code introduced by
Too many nested blocks (6/5)
Loading history...
unused-code introduced by
Too many nested blocks (8/5)
Loading history...
unused-code introduced by
Too many nested blocks (9/5)
Loading history...
unused-code introduced by
Too many nested blocks (7/5)
Loading history...
1241 1
            char = src[i]
1242
1243 1
            pos = alpha_pos[char]
1244
1245 1
            if pos >= 2:
1246 1
                xpos = pos - 2
1247
1248 1
                if i + 1 == len(src):
1249 1
                    pos = alpha_pos['']
1250
                else:
1251 1
                    pos = alpha_pos[src[i + 1]]
1252
1253 1
                start1 = phonet_hash_1[xpos, pos]
1254 1
                start2 = phonet_hash_1[xpos, 0]
1255 1
                end1 = phonet_hash_2[xpos, pos]
1256 1
                end2 = phonet_hash_2[xpos, 0]
1257
1258
                # preserve rule priorities
1259 1
                if (start2 >= 0) and ((start1 < 0) or (start2 < start1)):
1260 1
                    pos = start1
1261 1
                    start1 = start2
1262 1
                    start2 = pos
1263 1
                    pos = end1
1264 1
                    end1 = end2
1265 1
                    end2 = pos
1266
1267 1
                if (end1 >= start2) and (start2 >= 0):
1268 1
                    if end2 > end1:
1269 1
                        end1 = end2
1270
1271 1
                    start2 = -1
1272 1
                    end2 = -1
1273
            else:
1274 1
                pos = phonet_hash[char]
1275 1
                start1 = pos
1276 1
                end1 = 10000
1277 1
                start2 = -1
1278 1
                end2 = -1
1279
1280 1
            pos = start1
1281 1
            zeta0 = 0
1282
1283 1
            if pos >= 0:
1284
                # check rules for this char
1285 1
                while (_phonet_rules[pos] is None) or (
1286
                    _phonet_rules[pos][0] == char
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
1287
                ):
1288 1
                    if pos > end1:
1289 1
                        if start2 > 0:
1290 1
                            pos = start2
1291 1
                            start1 = start2
1292 1
                            start2 = -1
1293 1
                            end1 = end2
1294 1
                            end2 = -1
1295 1
                            continue
1296
1297 1
                        break
1298
1299 1
                    if (_phonet_rules[pos] is None) or (
1300
                        _phonet_rules[pos + mode] is None
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
1301
                    ):
1302
                        # no conversion rule available
1303 1
                        pos += 3
1304 1
                        continue
1305
1306
                    # check whole string
1307 1
                    matches = 1  # number of matching letters
1308 1
                    priority = 5  # default priority
1309 1
                    rule = _phonet_rules[pos]
1310 1
                    rule = rule[1:]
1311
1312 1
                    while (
1313
                        rule
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
1314
                        and (len(src) > (i + matches))
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
1315
                        and (src[i + matches] == rule[0])
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
1316
                        and not rule[0].isdigit()
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
1317
                        and (rule not in '(-<^$')
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
1318
                    ):
1319 1
                        matches += 1
1320 1
                        rule = rule[1:]
1321
1322 1
                    if rule and (rule[0] == '('):
1323
                        # check an array of letters
1324 1
                        if (
1325
                            (len(src) > (i + matches))
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
1326
                            and src[i + matches].isalpha()
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
1327
                            and (src[i + matches] in rule[1:])
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
1328
                        ):
1329 1
                            matches += 1
1330
1331 1
                            while rule and rule[0] != ')':
1332 1
                                rule = rule[1:]
1333
1334
                            # if rule[0] == ')':
1335 1
                            rule = rule[1:]
1336
1337 1
                    if rule:
1338 1
                        priority0 = ord(rule[0])
1339
                    else:
1340 1
                        priority0 = 0
1341
1342 1
                    matches0 = matches
1343
1344 1
                    while rule and rule[0] == '-' and matches > 1:
1345 1
                        matches -= 1
1346 1
                        rule = rule[1:]
1347
1348 1
                    if rule and rule[0] == '<':
1349 1
                        rule = rule[1:]
1350
1351 1
                    if rule and rule[0].isdigit():
1352
                        # read priority
1353 1
                        priority = int(rule[0])
1354 1
                        rule = rule[1:]
1355
1356 1
                    if rule and rule[0:2] == '^^':
1357 1
                        rule = rule[1:]
1358
1359 1
                    if (
1360
                        not rule
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
best-practice introduced by
Too many boolean expressions in if statement (12/5)
Loading history...
1361
                        or (
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
1362
                            (rule[0] == '^')
1363
                            and ((i == 0) or not src[i - 1].isalpha())
1364
                            and (
1365
                                (rule[1:2] != '$')
1366
                                or (
1367
                                    not (
1368
                                        src[
1369
                                            i + matches0 : i + matches0 + 1
1370
                                        ].isalpha()
1371
                                    )
1372
                                    and (
1373
                                        src[i + matches0 : i + matches0 + 1]
1374
                                        != '.'
1375
                                    )
1376
                                )
1377
                            )
1378
                        )
1379
                        or (
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
1380
                            (rule[0] == '$')
1381
                            and (i > 0)
1382
                            and src[i - 1].isalpha()
1383
                            and (
1384
                                (
1385
                                    not src[
1386
                                        i + matches0 : i + matches0 + 1
1387
                                    ].isalpha()
1388
                                )
1389
                                and (
1390
                                    src[i + matches0 : i + matches0 + 1] != '.'
1391
                                )
1392
                            )
1393
                        )
1394
                    ):
1395
                        # look for continuation, if:
1396
                        # matches > 1 und NO '-' in first string */
1397 1
                        pos0 = -1
1398
1399 1
                        start3 = 0
1400 1
                        start4 = 0
1401 1
                        end3 = 0
1402 1
                        end4 = 0
1403
1404 1
                        if (
1405
                            (matches > 1)
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
1406
                            and src[i + matches : i + matches + 1]
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
1407
                            and (priority0 != ord('-'))
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
1408
                        ):
1409 1
                            char0 = src[i + matches - 1]
1410 1
                            pos0 = alpha_pos[char0]
1411
1412 1
                            if pos0 >= 2 and src[i + matches]:
1413 1
                                xpos = pos0 - 2
1414 1
                                pos0 = alpha_pos[src[i + matches]]
1415 1
                                start3 = phonet_hash_1[xpos, pos0]
1416 1
                                start4 = phonet_hash_1[xpos, 0]
1417 1
                                end3 = phonet_hash_2[xpos, pos0]
1418 1
                                end4 = phonet_hash_2[xpos, 0]
1419
1420
                                # preserve rule priorities
1421 1
                                if (start4 >= 0) and (
1422
                                    (start3 < 0) or (start4 < start3)
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
1423
                                ):
1424 1
                                    pos0 = start3
1425 1
                                    start3 = start4
1426 1
                                    start4 = pos0
1427 1
                                    pos0 = end3
1428 1
                                    end3 = end4
1429 1
                                    end4 = pos0
1430
1431 1
                                if (end3 >= start4) and (start4 >= 0):
1432 1
                                    if end4 > end3:
1433 1
                                        end3 = end4
1434
1435 1
                                    start4 = -1
1436 1
                                    end4 = -1
1437
                            else:
1438 1
                                pos0 = phonet_hash[char0]
1439 1
                                start3 = pos0
1440 1
                                end3 = 10000
1441 1
                                start4 = -1
1442 1
                                end4 = -1
1443
1444 1
                            pos0 = start3
1445
1446
                        # check continuation rules for src[i+matches]
1447 1
                        if pos0 >= 0:
1448 1
                            while (_phonet_rules[pos0] is None) or (
1449
                                _phonet_rules[pos0][0] == char0
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
introduced by
Value '_phonet_rules[pos0]' is unsubscriptable
Loading history...
1450
                            ):
1451 1
                                if pos0 > end3:
1452 1
                                    if start4 > 0:
1453 1
                                        pos0 = start4
1454 1
                                        start3 = start4
1455 1
                                        start4 = -1
1456 1
                                        end3 = end4
1457 1
                                        end4 = -1
1458 1
                                        continue
1459
1460 1
                                    priority0 = -1
1461
1462
                                    # important
1463 1
                                    break
1464
1465 1
                                if (_phonet_rules[pos0] is None) or (
1466
                                    _phonet_rules[pos0 + mode] is None
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
1467
                                ):
1468
                                    # no conversion rule available
1469 1
                                    pos0 += 3
1470 1
                                    continue
1471
1472
                                # check whole string
1473 1
                                matches0 = matches
1474 1
                                priority0 = 5
1475 1
                                rule = _phonet_rules[pos0]
1476 1
                                rule = rule[1:]
0 ignored issues
show
introduced by
Value 'rule' is unsubscriptable
Loading history...
1477
1478 1
                                while (
1479
                                    rule
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
1480
                                    and (
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
1481
                                        src[i + matches0 : i + matches0 + 1]
1482
                                        == rule[0]
1483
                                    )
1484
                                    and (
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
1485
                                        not rule[0].isdigit()
1486
                                        or (rule in '(-<^$')
1487
                                    )
1488
                                ):
1489 1
                                    matches0 += 1
1490 1
                                    rule = rule[1:]
1491
1492 1
                                if rule and rule[0] == '(':
1493
                                    # check an array of letters
1494 1
                                    if src[
1495
                                        i + matches0 : i + matches0 + 1
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
1496
                                    ].isalpha() and (
1497
                                        src[i + matches0] in rule[1:]
1498
                                    ):
1499 1
                                        matches0 += 1
1500
1501 1
                                        while rule and rule[0] != ')':
1502 1
                                            rule = rule[1:]
1503
1504
                                        # if rule[0] == ')':
1505 1
                                        rule = rule[1:]
1506
1507 1
                                while rule and rule[0] == '-':
1508
                                    # "matches0" is NOT decremented
1509
                                    # because of  "if (matches0 == matches)"
1510 1
                                    rule = rule[1:]
1511
1512 1
                                if rule and rule[0] == '<':
1513 1
                                    rule = rule[1:]
1514
1515 1
                                if rule and rule[0].isdigit():
1516 1
                                    priority0 = int(rule[0])
1517 1
                                    rule = rule[1:]
1518
1519 1
                                if (
1520
                                    not rule
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
1521
                                    or
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
1522
                                    # rule == '^' is not possible here
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
1523
                                    (
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
1524
                                        (rule[0] == '$')
1525
                                        and not src[
1526
                                            i + matches0 : i + matches0 + 1
1527
                                        ].isalpha()
1528
                                        and (
1529
                                            src[
1530
                                                i + matches0 : i + matches0 + 1
1531
                                            ]
1532
                                            != '.'
1533
                                        )
1534
                                    )
1535
                                ):
1536 1
                                    if matches0 == matches:
1537
                                        # this is only a partial string
1538 1
                                        pos0 += 3
1539 1
                                        continue
1540
1541 1
                                    if priority0 < priority:
1542
                                        # priority is too low
1543 1
                                        pos0 += 3
1544 1
                                        continue
1545
1546
                                    # continuation rule found
1547 1
                                    break
1548
1549 1
                                pos0 += 3
1550
1551
                            # end of "while"
1552 1
                            if (priority0 >= priority) and (
1553
                                (_phonet_rules[pos0] is not None)
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
1554
                                and (_phonet_rules[pos0][0] == char0)
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
introduced by
Value '_phonet_rules[pos0]' is unsubscriptable
Loading history...
1555
                            ):
1556
1557 1
                                pos += 3
1558 1
                                continue
1559
1560
                        # replace string
1561 1
                        if _phonet_rules[pos] and (
1562
                            '<' in _phonet_rules[pos][1:]
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
1563
                        ):
1564 1
                            priority0 = 1
1565
                        else:
1566 1
                            priority0 = 0
1567
1568 1
                        rule = _phonet_rules[pos + mode]
1569
1570 1
                        if (priority0 == 1) and (zeta == 0):
1571
                            # rule with '<' is applied
1572 1
                            if (
1573
                                (j > 0)
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
1574
                                and rule
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
1575
                                and (
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
1576
                                    (dest[j - 1] == char)
1577
                                    or (dest[j - 1] == rule[0])
1578
                                )
1579
                            ):
1580 1
                                j -= 1
1581
1582 1
                            zeta0 = 1
1583 1
                            zeta += 1
1584 1
                            matches0 = 0
1585
1586 1
                            while rule and src[i + matches0]:
1587 1
                                src = (
1588
                                    src[0 : i + matches0]
1589
                                    + rule[0]
1590
                                    + src[i + matches0 + 1 :]
1591
                                )
1592 1
                                matches0 += 1
1593 1
                                rule = rule[1:]
1594
1595 1
                            if matches0 < matches:
1596 1
                                src = (
1597
                                    src[0 : i + matches0] + src[i + matches :]
1598
                                )
1599
1600 1
                            char = src[i]
1601
                        else:
1602 1
                            i = i + matches - 1
1603 1
                            zeta = 0
1604
1605 1
                            while len(rule) > 1:
1606 1
                                if (j == 0) or (dest[j - 1] != rule[0]):
1607 1
                                    dest = (
1608
                                        dest[0:j]
1609
                                        + rule[0]
1610
                                        + dest[min(len(dest), j + 1) :]
1611
                                    )
1612 1
                                    j += 1
1613
1614 1
                                rule = rule[1:]
1615
1616
                            # new "current char"
1617 1
                            if not rule:
1618 1
                                rule = ''
1619 1
                                char = ''
1620
                            else:
1621 1
                                char = rule[0]
1622
1623 1
                            if (
1624
                                _phonet_rules[pos]
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
1625
                                and '^^' in _phonet_rules[pos][1:]
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
1626
                            ):
1627 1
                                if char:
1628 1
                                    dest = (
1629
                                        dest[0:j]
1630
                                        + char
1631
                                        + dest[min(len(dest), j + 1) :]
1632
                                    )
1633 1
                                    j += 1
1634
1635 1
                                src = src[i + 1 :]
1636 1
                                i = 0
1637 1
                                zeta0 = 1
1638
1639 1
                        break
1640
1641 1
                    pos += 3
1642
1643 1
                    if pos > end1 and start2 > 0:
1644 1
                        pos = start2
1645 1
                        start1 = start2
1646 1
                        end1 = end2
1647 1
                        start2 = -1
1648 1
                        end2 = -1
1649
1650 1
            if zeta0 == 0:
1651 1
                if char and ((j == 0) or (dest[j - 1] != char)):
1652
                    # delete multiple letters only
1653 1
                    dest = dest[0:j] + char + dest[min(j + 1, term_length) :]
1654 1
                    j += 1
1655
1656 1
                i += 1
1657 1
                zeta = 0
1658
1659 1
        dest = dest[0:j]
1660
1661 1
        return dest
1662
1663 1
    _initialize_phonet(lang)
1664
1665 1
    word = unicode_normalize('NFKC', text_type(word))
1666 1
    return _phonet(word, mode, lang)
1667
1668
1669
if __name__ == '__main__':
    # When this module is executed directly as a script, run every doctest
    # embedded in its docstrings.
    from doctest import testmod

    testmod()
1673