Test Failed
Push — master ( 64abe2...a464fa )
by Chris
04:02 queued 11s
created

abydos.phonetic.phonet.phonet()   F

Complexity

Conditions 142

Size

Total Lines 1512
Code Lines 1315

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
cc 142
eloc 1315
nop 3
dl 0
loc 1512
rs 0
c 0
b 0
f 0

How to fix   Long Method    Complexity   

Long Method

Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.

For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.

Commonly applied refactorings include:

Complexity

Complex classes like abydos.phonetic.phonet.phonet() often do a lot of different things. To break such a class down, we need to identify a cohesive component within that class. A common approach to find such a component is to look for fields/methods that share the same prefixes, or suffixes.

Once you have determined the fields that belong together, you can apply the Extract Class refactoring. If the component makes sense as a sub-class, Extract Subclass is also a candidate, and is often faster.

1
# -*- coding: utf-8 -*-
2
3
# Copyright 2014-2018 by Christopher C. Little.
4
# This file is part of Abydos.
5
#
6
# Abydos is free software: you can redistribute it and/or modify
7
# it under the terms of the GNU General Public License as published by
8
# the Free Software Foundation, either version 3 of the License, or
9
# (at your option) any later version.
10
#
11
# Abydos is distributed in the hope that it will be useful,
12
# but WITHOUT ANY WARRANTY; without even the implied warranty of
13
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14
# GNU General Public License for more details.
15
#
16
# You should have received a copy of the GNU General Public License
17
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.
18
19
"""abydos.phonetic.phonet.
20
21
The phonetic.phonet module implements the phonet algorithm (a.k.a.
22
Hannoveraner Phonetik), intended chiefly for German.
23
"""
24
25
from __future__ import unicode_literals
26
27
from collections import Counter
28
from unicodedata import normalize as unicode_normalize
29
30
from six import text_type
31
from six.moves import range
32
33
__all__ = ['phonet']  # the only name exported by a star-import
34
35
36
def phonet(word, mode=1, lang='de'):
37
    """Return the phonet code for a word.
38
39
    phonet ("Hannoveraner Phonetik") was developed by Jörg Michael and
40
    documented in :cite:`Michael:1999`.
41
42
    This is a port of Jesper Zedlitz's code, which is licensed LGPL
43
    :cite:`Zedlitz:2015`.
44
45
    That is, in turn, based on Michael's C code, which is also licensed LGPL
46
    :cite:`Michael:2007`.
47
48
    :param str word: the word to transform
49
    :param int mode: the phonet variant to employ (1 or 2)
50
    :param str lang: 'de' (default) for German
51
            'none' for no language
52
    :returns: the phonet value
53
    :rtype: str
54
55
    >>> phonet('Christopher')
56
    'KRISTOFA'
57
    >>> phonet('Niall')
58
    'NIAL'
59
    >>> phonet('Smith')
60
    'SMIT'
61
    >>> phonet('Schmidt')
62
    'SHMIT'
63
64
    >>> phonet('Christopher', mode=2)
65
    'KRIZTUFA'
66
    >>> phonet('Niall', mode=2)
67
    'NIAL'
68
    >>> phonet('Smith', mode=2)
69
    'ZNIT'
70
    >>> phonet('Schmidt', mode=2)
71
    'ZNIT'
72
73
    >>> phonet('Christopher', lang='none')
74
    'CHRISTOPHER'
75
    >>> phonet('Niall', lang='none')
76
    'NIAL'
77
    >>> phonet('Smith', lang='none')
78
    'SMITH'
79
    >>> phonet('Schmidt', lang='none')
80
    'SCHMIDT'
81
    """
82
    _phonet_rules_no_lang = (  # separator chars
83
        '´', ' ', ' ',
84
        '"', ' ', ' ',
85
        '`$', '', '',
86
        '\'', ' ', ' ',
87
        ',', ',', ',',
88
        ';', ',', ',',
89
        '-', ' ', ' ',
90
        ' ', ' ', ' ',
91
        '.', '.', '.',
92
        ':', '.', '.',
93
        # German umlauts
94
        'Ä', 'AE', 'AE',
95
        'Ö', 'OE', 'OE',
96
        'Ü', 'UE', 'UE',
97
        'ß', 'S', 'S',
98
        # international umlauts
99
        'À', 'A', 'A',
100
        'Á', 'A', 'A',
101
        'Â', 'A', 'A',
102
        'Ã', 'A', 'A',
103
        'Å', 'A', 'A',
104
        'Æ', 'AE', 'AE',
105
        'Ç', 'C', 'C',
106
        'Ð', 'DJ', 'DJ',
107
        'È', 'E', 'E',
108
        'É', 'E', 'E',
109
        'Ê', 'E', 'E',
110
        'Ë', 'E', 'E',
111
        'Ì', 'I', 'I',
112
        'Í', 'I', 'I',
113
        'Î', 'I', 'I',
114
        'Ï', 'I', 'I',
115
        'Ñ', 'NH', 'NH',
116
        'Ò', 'O', 'O',
117
        'Ó', 'O', 'O',
118
        'Ô', 'O', 'O',
119
        'Õ', 'O', 'O',
120
        'Œ', 'OE', 'OE',
121
        'Ø', 'OE', 'OE',
122
        'Š', 'SH', 'SH',
123
        'Þ', 'TH', 'TH',
124
        'Ù', 'U', 'U',
125
        'Ú', 'U', 'U',
126
        'Û', 'U', 'U',
127
        'Ý', 'Y', 'Y',
128
        'Ÿ', 'Y', 'Y',
129
        # 'normal' letters (A-Z)
130
        'MC^', 'MAC', 'MAC',
131
        'MC^', 'MAC', 'MAC',
132
        'M´^', 'MAC', 'MAC',
133
        'M\'^', 'MAC', 'MAC',
134
        'O´^', 'O', 'O',
135
        'O\'^', 'O', 'O',
136
        'VAN DEN ^', 'VANDEN', 'VANDEN',
137
        None, None, None)
138
139
    _phonet_rules_german = (  # separator chars
140
        '´', ' ', ' ',
141
        '"', ' ', ' ',
142
        '`$', '', '',
143
        '\'', ' ', ' ',
144
        ',', ' ', ' ',
145
        ';', ' ', ' ',
146
        '-', ' ', ' ',
147
        ' ', ' ', ' ',
148
        '.', '.', '.',
149
        ':', '.', '.',
150
        # German umlauts
151
        'ÄE', 'E', 'E',
152
        'ÄU<', 'EU', 'EU',
153
        'ÄV(AEOU)-<', 'EW', None,
154
        'Ä$', 'Ä', None,
155
        'Ä<', None, 'E',
156
        'Ä', 'E', None,
157
        'ÖE', 'Ö', 'Ö',
158
        'ÖU', 'Ö', 'Ö',
159
        'ÖVER--<', 'ÖW', None,
160
        'ÖV(AOU)-', 'ÖW', None,
161
        'ÜBEL(GNRW)-^^', 'ÜBL ', 'IBL ',
162
        'ÜBER^^', 'ÜBA', 'IBA',
163
        'ÜE', 'Ü', 'I',
164
        'ÜVER--<', 'ÜW', None,
165
        'ÜV(AOU)-', 'ÜW', None,
166
        'Ü', None, 'I',
167
        'ßCH<', None, 'Z',
168
        'ß<', 'S', 'Z',
169
        # international umlauts
170
        'À<', 'A', 'A',
171
        'Á<', 'A', 'A',
172
        'Â<', 'A', 'A',
173
        'Ã<', 'A', 'A',
174
        'Å<', 'A', 'A',
175
        'ÆER-', 'E', 'E',
176
        'ÆU<', 'EU', 'EU',
177
        'ÆV(AEOU)-<', 'EW', None,
178
        'Æ$', 'Ä', None,
179
        'Æ<', None, 'E',
180
        'Æ', 'E', None,
181
        'Ç', 'Z', 'Z',
182
        'ÐÐ-', '', '',
183
        'Ð', 'DI', 'TI',
184
        'È<', 'E', 'E',
185
        'É<', 'E', 'E',
186
        'Ê<', 'E', 'E',
187
        'Ë', 'E', 'E',
188
        'Ì<', 'I', 'I',
189
        'Í<', 'I', 'I',
190
        'Î<', 'I', 'I',
191
        'Ï', 'I', 'I',
192
        'ÑÑ-', '', '',
193
        'Ñ', 'NI', 'NI',
194
        'Ò<', 'O', 'U',
195
        'Ó<', 'O', 'U',
196
        'Ô<', 'O', 'U',
197
        'Õ<', 'O', 'U',
198
        'Œ<', 'Ö', 'Ö',
199
        'Ø(IJY)-<', 'E', 'E',
200
        'Ø<', 'Ö', 'Ö',
201
        'Š', 'SH', 'Z',
202
        'Þ', 'T', 'T',
203
        'Ù<', 'U', 'U',
204
        'Ú<', 'U', 'U',
205
        'Û<', 'U', 'U',
206
        'Ý<', 'I', 'I',
207
        'Ÿ<', 'I', 'I',
208
        # 'normal' letters (A-Z)
209
        'ABELLE$', 'ABL', 'ABL',
210
        'ABELL$', 'ABL', 'ABL',
211
        'ABIENNE$', 'ABIN', 'ABIN',
212
        'ACHME---^', 'ACH', 'AK',
213
        'ACEY$', 'AZI', 'AZI',
214
        'ADV', 'ATW', None,
215
        'AEGL-', 'EK', None,
216
        'AEU<', 'EU', 'EU',
217
        'AE2', 'E', 'E',
218
        'AFTRAUBEN------', 'AFT ', 'AFT ',
219
        'AGL-1', 'AK', None,
220
        'AGNI-^', 'AKN', 'AKN',
221
        'AGNIE-', 'ANI', 'ANI',
222
        'AGN(AEOU)-$', 'ANI', 'ANI',
223
        'AH(AIOÖUÜY)-', 'AH', None,
224
        'AIA2', 'AIA', 'AIA',
225
        'AIE$', 'E', 'E',
226
        'AILL(EOU)-', 'ALI', 'ALI',
227
        'AINE$', 'EN', 'EN',
228
        'AIRE$', 'ER', 'ER',
229
        'AIR-', 'E', 'E',
230
        'AISE$', 'ES', 'EZ',
231
        'AISSANCE$', 'ESANS', 'EZANZ',
232
        'AISSE$', 'ES', 'EZ',
233
        'AIX$', 'EX', 'EX',
234
        'AJ(AÄEÈÉÊIOÖUÜ)--', 'A', 'A',
235
        'AKTIE', 'AXIE', 'AXIE',
236
        'AKTUEL', 'AKTUEL', None,
237
        'ALOI^', 'ALOI', 'ALUI',  # Don't merge these rules
238
        'ALOY^', 'ALOI', 'ALUI',  # needed by 'check_rules'
239
        'AMATEU(RS)-', 'AMATÖ', 'ANATÖ',
240
        'ANCH(OEI)-', 'ANSH', 'ANZ',
241
        'ANDERGEGANG----', 'ANDA GE', 'ANTA KE',
242
        'ANDERGEHE----', 'ANDA ', 'ANTA ',
243
        'ANDERGESETZ----', 'ANDA GE', 'ANTA KE',
244
        'ANDERGING----', 'ANDA ', 'ANTA ',
245
        'ANDERSETZ(ET)-----', 'ANDA ', 'ANTA ',
246
        'ANDERZUGEHE----', 'ANDA ZU ', 'ANTA ZU ',
247
        'ANDERZUSETZE-----', 'ANDA ZU ', 'ANTA ZU ',
248
        'ANER(BKO)---^^', 'AN', None,
249
        'ANHAND---^$', 'AN H', 'AN ',
250
        'ANH(AÄEIOÖUÜY)--^^', 'AN', None,
251
        'ANIELLE$', 'ANIEL', 'ANIL',
252
        'ANIEL', 'ANIEL', None,
253
        'ANSTELLE----^$', 'AN ST', 'AN ZT',
254
        'ANTI^^', 'ANTI', 'ANTI',
255
        'ANVER^^', 'ANFA', 'ANFA',
256
        'ATIA$', 'ATIA', 'ATIA',
257
        'ATIA(NS)--', 'ATI', 'ATI',
258
        'ATI(AÄOÖUÜ)-', 'AZI', 'AZI',
259
        'AUAU--', '', '',
260
        'AUERE$', 'AUERE', None,
261
        'AUERE(NS)-$', 'AUERE', None,
262
        'AUERE(AIOUY)--', 'AUER', None,
263
        'AUER(AÄIOÖUÜY)-', 'AUER', None,
264
        'AUER<', 'AUA', 'AUA',
265
        'AUF^^', 'AUF', 'AUF',
266
        'AULT$', 'O', 'U',
267
        'AUR(BCDFGKLMNQSTVWZ)-', 'AUA', 'AUA',
268
        'AUR$', 'AUA', 'AUA',
269
        'AUSSE$', 'OS', 'UZ',
270
        'AUS(ST)-^', 'AUS', 'AUS',
271
        'AUS^^', 'AUS', 'AUS',
272
        'AUTOFAHR----', 'AUTO ', 'AUTU ',
273
        'AUTO^^', 'AUTO', 'AUTU',
274
        'AUX(IY)-', 'AUX', 'AUX',
275
        'AUX', 'O', 'U',
276
        'AU', 'AU', 'AU',
277
        'AVER--<', 'AW', None,
278
        'AVIER$', 'AWIE', 'AFIE',
279
        'AV(EÈÉÊI)-^', 'AW', None,
280
        'AV(AOU)-', 'AW', None,
281
        'AYRE$', 'EIRE', 'EIRE',
282
        'AYRE(NS)-$', 'EIRE', 'EIRE',
283
        'AYRE(AIOUY)--', 'EIR', 'EIR',
284
        'AYR(AÄIOÖUÜY)-', 'EIR', 'EIR',
285
        'AYR<', 'EIA', 'EIA',
286
        'AYER--<', 'EI', 'EI',
287
        'AY(AÄEIOÖUÜY)--', 'A', 'A',
288
        'AË', 'E', 'E',
289
        'A(IJY)<', 'EI', 'EI',
290
        'BABY^$', 'BEBI', 'BEBI',
291
        'BAB(IY)^', 'BEBI', 'BEBI',
292
        'BEAU^$', 'BO', None,
293
        'BEA(BCMNRU)-^', 'BEA', 'BEA',
294
        'BEAT(AEIMORU)-^', 'BEAT', 'BEAT',
295
        'BEE$', 'BI', 'BI',
296
        'BEIGE^$', 'BESH', 'BEZ',
297
        'BENOIT--', 'BENO', 'BENU',
298
        'BER(DT)-', 'BER', None,
299
        'BERN(DT)-', 'BERN', None,
300
        'BE(LMNRST)-^', 'BE', 'BE',
301
        'BETTE$', 'BET', 'BET',
302
        'BEVOR^$', 'BEFOR', None,
303
        'BIC$', 'BIZ', 'BIZ',
304
        'BOWL(EI)-', 'BOL', 'BUL',
305
        'BP(AÄEÈÉÊIÌÍÎOÖRUÜY)-', 'B', 'B',
306
        'BRINGEND-----^', 'BRI', 'BRI',
307
        'BRINGEND-----', ' BRI', ' BRI',
308
        'BROW(NS)-', 'BRAU', 'BRAU',
309
        'BUDGET7', 'BÜGE', 'BIKE',
310
        'BUFFET7', 'BÜFE', 'BIFE',
311
        'BYLLE$', 'BILE', 'BILE',
312
        'BYLL$', 'BIL', 'BIL',
313
        'BYPA--^', 'BEI', 'BEI',
314
        'BYTE<', 'BEIT', 'BEIT',
315
        'BY9^', 'BÜ', None,
316
        'B(SßZ)$', 'BS', None,
317
        'CACH(EI)-^', 'KESH', 'KEZ',
318
        'CAE--', 'Z', 'Z',
319
        'CA(IY)$', 'ZEI', 'ZEI',
320
        'CE(EIJUY)--', 'Z', 'Z',
321
        'CENT<', 'ZENT', 'ZENT',
322
        'CERST(EI)----^', 'KE', 'KE',
323
        'CER$', 'ZA', 'ZA',
324
        'CE3', 'ZE', 'ZE',
325
        'CH\'S$', 'X', 'X',
326
        'CH´S$', 'X', 'X',
327
        'CHAO(ST)-', 'KAO', 'KAU',
328
        'CHAMPIO-^', 'SHEMPI', 'ZENBI',
329
        'CHAR(AI)-^', 'KAR', 'KAR',
330
        'CHAU(CDFSVWXZ)-', 'SHO', 'ZU',
331
        'CHÄ(CF)-', 'SHE', 'ZE',
332
        'CHE(CF)-', 'SHE', 'ZE',
333
        'CHEM-^', 'KE', 'KE',  # or: 'CHE', 'KE'
334
        'CHEQUE<', 'SHEK', 'ZEK',
335
        'CHI(CFGPVW)-', 'SHI', 'ZI',
336
        'CH(AEUY)-<^', 'SH', 'Z',
337
        'CHK-', '', '',
338
        'CHO(CKPS)-^', 'SHO', 'ZU',
339
        'CHRIS-', 'KRI', None,
340
        'CHRO-', 'KR', None,
341
        'CH(LOR)-<^', 'K', 'K',
342
        'CHST-', 'X', 'X',
343
        'CH(SßXZ)3', 'X', 'X',
344
        'CHTNI-3', 'CHN', 'KN',
345
        'CH^', 'K', 'K',  # or: 'CH', 'K'
346
        'CH', 'CH', 'K',
347
        'CIC$', 'ZIZ', 'ZIZ',
348
        'CIENCEFICT----', 'EIENS ', 'EIENZ ',
349
        'CIENCE$', 'EIENS', 'EIENZ',
350
        'CIER$', 'ZIE', 'ZIE',
351
        'CYB-^', 'ZEI', 'ZEI',
352
        'CY9^', 'ZÜ', 'ZI',
353
        'C(IJY)-<3', 'Z', 'Z',
354
        'CLOWN-', 'KLAU', 'KLAU',
355
        'CCH', 'Z', 'Z',
356
        'CCE-', 'X', 'X',
357
        'C(CK)-', '', '',
358
        'CLAUDET---', 'KLO', 'KLU',
359
        'CLAUDINE^$', 'KLODIN', 'KLUTIN',
360
        'COACH', 'KOSH', 'KUZ',
361
        'COLE$', 'KOL', 'KUL',
362
        'COUCH', 'KAUSH', 'KAUZ',
363
        'COW', 'KAU', 'KAU',
364
        'CQUES$', 'K', 'K',
365
        'CQUE', 'K', 'K',
366
        'CRASH--9', 'KRE', 'KRE',
367
        'CREAT-^', 'KREA', 'KREA',
368
        'CST', 'XT', 'XT',
369
        'CS<^', 'Z', 'Z',
370
        'C(SßX)', 'X', 'X',
371
        'CT\'S$', 'X', 'X',
372
        'CT(SßXZ)', 'X', 'X',
373
        'CZ<', 'Z', 'Z',
374
        'C(ÈÉÊÌÍÎÝ)3', 'Z', 'Z',
375
        'C.^', 'C.', 'C.',
376
        'CÄ-', 'Z', 'Z',
377
        'CÜ$', 'ZÜ', 'ZI',
378
        'C\'S$', 'X', 'X',
379
        'C<', 'K', 'K',
380
        'DAHER^$', 'DAHER', None,
381
        'DARAUFFOLGE-----', 'DARAUF ', 'TARAUF ',
382
        'DAVO(NR)-^$', 'DAFO', 'TAFU',
383
        'DD(SZ)--<', '', '',
384
        'DD9', 'D', None,
385
        'DEPOT7', 'DEPO', 'TEBU',
386
        'DESIGN', 'DISEIN', 'TIZEIN',
387
        'DE(LMNRST)-3^', 'DE', 'TE',
388
        'DETTE$', 'DET', 'TET',
389
        'DH$', 'T', None,
390
        'DIC$', 'DIZ', 'TIZ',
391
        'DIDR-^', 'DIT', None,
392
        'DIEDR-^', 'DIT', None,
393
        'DJ(AEIOU)-^', 'I', 'I',
394
        'DMITR-^', 'DIMIT', 'TINIT',
395
        'DRY9^', 'DRÜ', None,
396
        'DT-', '', '',
397
        'DUIS-^', 'DÜ', 'TI',
398
        'DURCH^^', 'DURCH', 'TURK',
399
        'DVA$', 'TWA', None,
400
        'DY9^', 'DÜ', None,
401
        'DYS$', 'DIS', None,
402
        'DS(CH)--<', 'T', 'T',
403
        'DST', 'ZT', 'ZT',
404
        'DZS(CH)--', 'T', 'T',
405
        'D(SßZ)', 'Z', 'Z',
406
        'D(AÄEIOÖRUÜY)-', 'D', None,
407
        'D(ÀÁÂÃÅÈÉÊÌÍÎÙÚÛ)-', 'D', None,
408
        'D\'H^', 'D', 'T',
409
        'D´H^', 'D', 'T',
410
        'D`H^', 'D', 'T',
411
        'D\'S3$', 'Z', 'Z',
412
        'D´S3$', 'Z', 'Z',
413
        'D^', 'D', None,
414
        'D', 'T', 'T',
415
        'EAULT$', 'O', 'U',
416
        'EAUX$', 'O', 'U',
417
        'EAU', 'O', 'U',
418
        'EAV', 'IW', 'IF',
419
        'EAS3$', 'EAS', None,
420
        'EA(AÄEIOÖÜY)-3', 'EA', 'EA',
421
        'EA3$', 'EA', 'EA',
422
        'EA3', 'I', 'I',
423
        'EBENSO^$', 'EBNSO', 'EBNZU',
424
        'EBENSO^^', 'EBNSO ', 'EBNZU ',
425
        'EBEN^^', 'EBN', 'EBN',
426
        'EE9', 'E', 'E',
427
        'EGL-1', 'EK', None,
428
        'EHE(IUY)--1', 'EH', None,
429
        'EHUNG---1', 'E', None,
430
        'EH(AÄIOÖUÜY)-1', 'EH', None,
431
        'EIEI--', '', '',
432
        'EIERE^$', 'EIERE', None,
433
        'EIERE$', 'EIERE', None,
434
        'EIERE(NS)-$', 'EIERE', None,
435
        'EIERE(AIOUY)--', 'EIER', None,
436
        'EIER(AÄIOÖUÜY)-', 'EIER', None,
437
        'EIER<', 'EIA', None,
438
        'EIGL-1', 'EIK', None,
439
        'EIGH$', 'EI', 'EI',
440
        'EIH--', 'E', 'E',
441
        'EILLE$', 'EI', 'EI',
442
        'EIR(BCDFGKLMNQSTVWZ)-', 'EIA', 'EIA',
443
        'EIR$', 'EIA', 'EIA',
444
        'EITRAUBEN------', 'EIT ', 'EIT ',
445
        'EI', 'EI', 'EI',
446
        'EJ$', 'EI', 'EI',
447
        'ELIZ^', 'ELIS', None,
448
        'ELZ^', 'ELS', None,
449
        'EL-^', 'E', 'E',
450
        'ELANG----1', 'E', 'E',
451
        'EL(DKL)--1', 'E', 'E',
452
        'EL(MNT)--1$', 'E', 'E',
453
        'ELYNE$', 'ELINE', 'ELINE',
454
        'ELYN$', 'ELIN', 'ELIN',
455
        'EL(AÄEÈÉÊIÌÍÎOÖUÜY)-1', 'EL', 'EL',
456
        'EL-1', 'L', 'L',
457
        'EM-^', None, 'E',
458
        'EM(DFKMPQT)--1', None, 'E',
459
        'EM(AÄEÈÉÊIÌÍÎOÖUÜY)--1', None, 'E',
460
        'EM-1', None, 'N',
461
        'ENGAG-^', 'ANGA', 'ANKA',
462
        'EN-^', 'E', 'E',
463
        'ENTUEL', 'ENTUEL', None,
464
        'EN(CDGKQSTZ)--1', 'E', 'E',
465
        'EN(AÄEÈÉÊIÌÍÎNOÖUÜY)-1', 'EN', 'EN',
466
        'EN-1', '', '',
467
        'ERH(AÄEIOÖUÜ)-^', 'ERH', 'ER',
468
        'ER-^', 'E', 'E',
469
        'ERREGEND-----', ' ER', ' ER',
470
        'ERT1$', 'AT', None,
471
        'ER(DGLKMNRQTZß)-1', 'ER', None,
472
        'ER(AÄEÈÉÊIÌÍÎOÖUÜY)-1', 'ER', 'A',
473
        'ER1$', 'A', 'A',
474
        'ER<1', 'A', 'A',
475
        'ETAT7', 'ETA', 'ETA',
476
        'ETI(AÄOÖÜU)-', 'EZI', 'EZI',
477
        'EUERE$', 'EUERE', None,
478
        'EUERE(NS)-$', 'EUERE', None,
479
        'EUERE(AIOUY)--', 'EUER', None,
480
        'EUER(AÄIOÖUÜY)-', 'EUER', None,
481
        'EUER<', 'EUA', None,
482
        'EUEU--', '', '',
483
        'EUILLE$', 'Ö', 'Ö',
484
        'EUR$', 'ÖR', 'ÖR',
485
        'EUX', 'Ö', 'Ö',
486
        'EUSZ$', 'EUS', None,
487
        'EUTZ$', 'EUS', None,
488
        'EUYS$', 'EUS', 'EUZ',
489
        'EUZ$', 'EUS', None,
490
        'EU', 'EU', 'EU',
491
        'EVER--<1', 'EW', None,
492
        'EV(ÄOÖUÜ)-1', 'EW', None,
493
        'EYER<', 'EIA', 'EIA',
494
        'EY<', 'EI', 'EI',
495
        'FACETTE', 'FASET', 'FAZET',
496
        'FANS--^$', 'FE', 'FE',
497
        'FAN-^$', 'FE', 'FE',
498
        'FAULT-', 'FOL', 'FUL',
499
        'FEE(DL)-', 'FI', 'FI',
500
        'FEHLER', 'FELA', 'FELA',
501
        'FE(LMNRST)-3^', 'FE', 'FE',
502
        'FOERDERN---^', 'FÖRD', 'FÖRT',
503
        'FOERDERN---', ' FÖRD', ' FÖRT',
504
        'FOND7', 'FON', 'FUN',
505
        'FRAIN$', 'FRA', 'FRA',
506
        'FRISEU(RS)-', 'FRISÖ', 'FRIZÖ',
507
        'FY9^', 'FÜ', None,
508
        'FÖRDERN---^', 'FÖRD', 'FÖRT',
509
        'FÖRDERN---', ' FÖRD', ' FÖRT',
510
        'GAGS^$', 'GEX', 'KEX',
511
        'GAG^$', 'GEK', 'KEK',
512
        'GD', 'KT', 'KT',
513
        'GEGEN^^', 'GEGN', 'KEKN',
514
        'GEGENGEKOM-----', 'GEGN ', 'KEKN ',
515
        'GEGENGESET-----', 'GEGN ', 'KEKN ',
516
        'GEGENKOMME-----', 'GEGN ', 'KEKN ',
517
        'GEGENZUKOM---', 'GEGN ZU ', 'KEKN ZU ',
518
        'GENDETWAS-----$', 'GENT ', 'KENT ',
519
        'GENRE', 'IORE', 'IURE',
520
        'GE(LMNRST)-3^', 'GE', 'KE',
521
        'GER(DKT)-', 'GER', None,
522
        'GETTE$', 'GET', 'KET',
523
        'GGF.', 'GF.', None,
524
        'GG-', '', '',
525
        'GH', 'G', None,
526
        'GI(AOU)-^', 'I', 'I',
527
        'GION-3', 'KIO', 'KIU',
528
        'G(CK)-', '', '',
529
        'GJ(AEIOU)-^', 'I', 'I',
530
        'GMBH^$', 'GMBH', 'GMBH',
531
        'GNAC$', 'NIAK', 'NIAK',
532
        'GNON$', 'NION', 'NIUN',
533
        'GN$', 'N', 'N',
534
        'GONCAL-^', 'GONZA', 'KUNZA',
535
        'GRY9^', 'GRÜ', None,
536
        'G(SßXZ)-<', 'K', 'K',
537
        'GUCK-', 'KU', 'KU',
538
        'GUISEP-^', 'IUSE', 'IUZE',
539
        'GUI-^', 'G', 'K',
540
        'GUTAUSSEH------^', 'GUT ', 'KUT ',
541
        'GUTGEHEND------^', 'GUT ', 'KUT ',
542
        'GY9^', 'GÜ', None,
543
        'G(AÄEILOÖRUÜY)-', 'G', None,
544
        'G(ÀÁÂÃÅÈÉÊÌÍÎÙÚÛ)-', 'G', None,
545
        'G\'S$', 'X', 'X',
546
        'G´S$', 'X', 'X',
547
        'G^', 'G', None,
548
        'G', 'K', 'K',
549
        'HA(HIUY)--1', 'H', None,
550
        'HANDVOL---^', 'HANT ', 'ANT ',
551
        'HANNOVE-^', 'HANOF', None,
552
        'HAVEN7$', 'HAFN', None,
553
        'HEAD-', 'HE', 'E',
554
        'HELIEGEN------', 'E ', 'E ',
555
        'HESTEHEN------', 'E ', 'E ',
556
        'HE(LMNRST)-3^', 'HE', 'E',
557
        'HE(LMN)-1', 'E', 'E',
558
        'HEUR1$', 'ÖR', 'ÖR',
559
        'HE(HIUY)--1', 'H', None,
560
        'HIH(AÄEIOÖUÜY)-1', 'IH', None,
561
        'HLH(AÄEIOÖUÜY)-1', 'LH', None,
562
        'HMH(AÄEIOÖUÜY)-1', 'MH', None,
563
        'HNH(AÄEIOÖUÜY)-1', 'NH', None,
564
        'HOBBY9^', 'HOBI', None,
565
        'HOCHBEGAB-----^', 'HOCH ', 'UK ',
566
        'HOCHTALEN-----^', 'HOCH ', 'UK ',
567
        'HOCHZUFRI-----^', 'HOCH ', 'UK ',
568
        'HO(HIY)--1', 'H', None,
569
        'HRH(AÄEIOÖUÜY)-1', 'RH', None,
570
        'HUH(AÄEIOÖUÜY)-1', 'UH', None,
571
        'HUIS^^', 'HÜS', 'IZ',
572
        'HUIS$', 'ÜS', 'IZ',
573
        'HUI--1', 'H', None,
574
        'HYGIEN^', 'HÜKIEN', None,
575
        'HY9^', 'HÜ', None,
576
        'HY(BDGMNPST)-', 'Ü', None,
577
        'H.^', None, 'H.',
578
        'HÄU--1', 'H', None,
579
        'H^', 'H', '',
580
        'H', '', '',
581
        'ICHELL---', 'ISH', 'IZ',
582
        'ICHI$', 'ISHI', 'IZI',
583
        'IEC$', 'IZ', 'IZ',
584
        'IEDENSTELLE------', 'IDN ', 'ITN ',
585
        'IEI-3', '', '',
586
        'IELL3', 'IEL', 'IEL',
587
        'IENNE$', 'IN', 'IN',
588
        'IERRE$', 'IER', 'IER',
589
        'IERZULAN---', 'IR ZU ', 'IR ZU ',
590
        'IETTE$', 'IT', 'IT',
591
        'IEU', 'IÖ', 'IÖ',
592
        'IE<4', 'I', 'I',
593
        'IGL-1', 'IK', None,
594
        'IGHT3$', 'EIT', 'EIT',
595
        'IGNI(EO)-', 'INI', 'INI',
596
        'IGN(AEOU)-$', 'INI', 'INI',
597
        'IHER(DGLKRT)--1', 'IHE', None,
598
        'IHE(IUY)--', 'IH', None,
599
        'IH(AIOÖUÜY)-', 'IH', None,
600
        'IJ(AOU)-', 'I', 'I',
601
        'IJ$', 'I', 'I',
602
        'IJ<', 'EI', 'EI',
603
        'IKOLE$', 'IKOL', 'IKUL',
604
        'ILLAN(STZ)--4', 'ILIA', 'ILIA',
605
        'ILLAR(DT)--4', 'ILIA', 'ILIA',
606
        'IMSTAN----^', 'IM ', 'IN ',
607
        'INDELERREGE------', 'INDL ', 'INTL ',
608
        'INFRAGE-----^$', 'IN ', 'IN ',
609
        'INTERN(AOU)-^', 'INTAN', 'INTAN',
610
        'INVER-', 'INWE', 'INFE',
611
        'ITI(AÄIOÖUÜ)-', 'IZI', 'IZI',
612
        'IUSZ$', 'IUS', None,
613
        'IUTZ$', 'IUS', None,
614
        'IUZ$', 'IUS', None,
615
        'IVER--<', 'IW', None,
616
        'IVIER$', 'IWIE', 'IFIE',
617
        'IV(ÄOÖUÜ)-', 'IW', None,
618
        'IV<3', 'IW', None,
619
        'IY2', 'I', None,
620
        'I(ÈÉÊ)<4', 'I', 'I',
621
        'JAVIE---<^', 'ZA', 'ZA',
622
        'JEANS^$', 'JINS', 'INZ',
623
        'JEANNE^$', 'IAN', 'IAN',
624
        'JEAN-^', 'IA', 'IA',
625
        'JER-^', 'IE', 'IE',
626
        'JE(LMNST)-', 'IE', 'IE',
627
        'JI^', 'JI', None,
628
        'JOR(GK)^$', 'IÖRK', 'IÖRK',
629
        'J', 'I', 'I',
630
        'KC(ÄEIJ)-', 'X', 'X',
631
        'KD', 'KT', None,
632
        'KE(LMNRST)-3^', 'KE', 'KE',
633
        'KG(AÄEILOÖRUÜY)-', 'K', None,
634
        'KH<^', 'K', 'K',
635
        'KIC$', 'KIZ', 'KIZ',
636
        'KLE(LMNRST)-3^', 'KLE', 'KLE',
637
        'KOTELE-^', 'KOTL', 'KUTL',
638
        'KREAT-^', 'KREA', 'KREA',
639
        'KRÜS(TZ)--^', 'KRI', None,
640
        'KRYS(TZ)--^', 'KRI', None,
641
        'KRY9^', 'KRÜ', None,
642
        'KSCH---', 'K', 'K',
643
        'KSH--', 'K', 'K',
644
        'K(SßXZ)7', 'X', 'X',  # implies 'KST' -> 'XT'
645
        'KT\'S$', 'X', 'X',
646
        'KTI(AIOU)-3', 'XI', 'XI',
647
        'KT(SßXZ)', 'X', 'X',
648
        'KY9^', 'KÜ', None,
649
        'K\'S$', 'X', 'X',
650
        'K´S$', 'X', 'X',
651
        'LANGES$', ' LANGES', ' LANKEZ',
652
        'LANGE$', ' LANGE', ' LANKE',
653
        'LANG$', ' LANK', ' LANK',
654
        'LARVE-', 'LARF', 'LARF',
655
        'LD(SßZ)$', 'LS', 'LZ',
656
        'LD\'S$', 'LS', 'LZ',
657
        'LD´S$', 'LS', 'LZ',
658
        'LEAND-^', 'LEAN', 'LEAN',
659
        'LEERSTEHE-----^', 'LER ', 'LER ',
660
        'LEICHBLEIB-----', 'LEICH ', 'LEIK ',
661
        'LEICHLAUTE-----', 'LEICH ', 'LEIK ',
662
        'LEIDERREGE------', 'LEIT ', 'LEIT ',
663
        'LEIDGEPR----^', 'LEIT ', 'LEIT ',
664
        'LEINSTEHE-----', 'LEIN ', 'LEIN ',
665
        'LEL-', 'LE', 'LE',
666
        'LE(MNRST)-3^', 'LE', 'LE',
667
        'LETTE$', 'LET', 'LET',
668
        'LFGNAG-', 'LFGAN', 'LFKAN',
669
        'LICHERWEIS----', 'LICHA ', 'LIKA ',
670
        'LIC$', 'LIZ', 'LIZ',
671
        'LIVE^$', 'LEIF', 'LEIF',
672
        'LT(SßZ)$', 'LS', 'LZ',
673
        'LT\'S$', 'LS', 'LZ',
674
        'LT´S$', 'LS', 'LZ',
675
        'LUI(GS)--', 'LU', 'LU',
676
        'LV(AIO)-', 'LW', None,
677
        'LY9^', 'LÜ', None,
678
        'LSTS$', 'LS', 'LZ',
679
        'LZ(BDFGKLMNPQRSTVWX)-', 'LS', None,
680
        'L(SßZ)$', 'LS', None,
681
        'MAIR-<', 'MEI', 'NEI',
682
        'MANAG-', 'MENE', 'NENE',
683
        'MANUEL', 'MANUEL', None,
684
        'MASSEU(RS)-', 'MASÖ', 'NAZÖ',
685
        'MATCH', 'MESH', 'NEZ',
686
        'MAURICE', 'MORIS', 'NURIZ',
687
        'MBH^$', 'MBH', 'MBH',
688
        'MB(ßZ)$', 'MS', None,
689
        'MB(SßTZ)-', 'M', 'N',
690
        'MCG9^', 'MAK', 'NAK',
691
        'MC9^', 'MAK', 'NAK',
692
        'MEMOIR-^', 'MEMOA', 'NENUA',
693
        'MERHAVEN$', 'MAHAFN', None,
694
        'ME(LMNRST)-3^', 'ME', 'NE',
695
        'MEN(STZ)--3', 'ME', None,
696
        'MEN$', 'MEN', None,
697
        'MIGUEL-', 'MIGE', 'NIKE',
698
        'MIKE^$', 'MEIK', 'NEIK',
699
        'MITHILFE----^$', 'MIT H', 'NIT ',
700
        'MN$', 'M', None,
701
        'MN', 'N', 'N',
702
        'MPJUTE-', 'MPUT', 'NBUT',
703
        'MP(ßZ)$', 'MS', None,
704
        'MP(SßTZ)-', 'M', 'N',
705
        'MP(BDJLMNPQVW)-', 'MB', 'NB',
706
        'MY9^', 'MÜ', None,
707
        'M(ßZ)$', 'MS', None,
708
        'M´G7^', 'MAK', 'NAK',
709
        'M\'G7^', 'MAK', 'NAK',
710
        'M´^', 'MAK', 'NAK',
711
        'M\'^', 'MAK', 'NAK',
712
        'M', None, 'N',
713
        'NACH^^', 'NACH', 'NAK',
714
        'NADINE', 'NADIN', 'NATIN',
715
        'NAIV--', 'NA', 'NA',
716
        'NAISE$', 'NESE', 'NEZE',
717
        'NAUGENOMM------', 'NAU ', 'NAU ',
718
        'NAUSOGUT$', 'NAUSO GUT', 'NAUZU KUT',
719
        'NCH$', 'NSH', 'NZ',
720
        'NCOISE$', 'SOA', 'ZUA',
721
        'NCOIS$', 'SOA', 'ZUA',
722
        'NDAR$', 'NDA', 'NTA',
723
        'NDERINGEN------', 'NDE ', 'NTE ',
724
        'NDRO(CDKTZ)-', 'NTRO', None,
725
        'ND(BFGJLMNPQVW)-', 'NT', None,
726
        'ND(SßZ)$', 'NS', 'NZ',
727
        'ND\'S$', 'NS', 'NZ',
728
        'ND´S$', 'NS', 'NZ',
729
        'NEBEN^^', 'NEBN', 'NEBN',
730
        'NENGELERN------', 'NEN ', 'NEN ',
731
        'NENLERN(ET)---', 'NEN LE', 'NEN LE',
732
        'NENZULERNE---', 'NEN ZU LE', 'NEN ZU LE',
733
        'NE(LMNRST)-3^', 'NE', 'NE',
734
        'NEN-3', 'NE', 'NE',
735
        'NETTE$', 'NET', 'NET',
736
        'NGU^^', 'NU', 'NU',
737
        'NG(BDFJLMNPQRTVW)-', 'NK', 'NK',
738
        'NH(AUO)-$', 'NI', 'NI',
739
        'NICHTSAHNEN-----', 'NIX ', 'NIX ',
740
        'NICHTSSAGE----', 'NIX ', 'NIX ',
741
        'NICHTS^^', 'NIX', 'NIX',
742
        'NICHT^^', 'NICHT', 'NIKT',
743
        'NINE$', 'NIN', 'NIN',
744
        'NON^^', 'NON', 'NUN',
745
        'NOTLEIDE-----^', 'NOT ', 'NUT ',
746
        'NOT^^', 'NOT', 'NUT',
747
        'NTI(AIOU)-3', 'NZI', 'NZI',
748
        'NTIEL--3', 'NZI', 'NZI',
749
        'NT(SßZ)$', 'NS', 'NZ',
750
        'NT\'S$', 'NS', 'NZ',
751
        'NT´S$', 'NS', 'NZ',
752
        'NYLON', 'NEILON', 'NEILUN',
753
        'NY9^', 'NÜ', None,
754
        'NSTZUNEH---', 'NST ZU ', 'NZT ZU ',
755
        'NSZ-', 'NS', None,
756
        'NSTS$', 'NS', 'NZ',
757
        'NZ(BDFGKLMNPQRSTVWX)-', 'NS', None,
758
        'N(SßZ)$', 'NS', None,
759
        'OBERE-', 'OBER', None,
760
        'OBER^^', 'OBA', 'UBA',
761
        'OEU2', 'Ö', 'Ö',
762
        'OE<2', 'Ö', 'Ö',
763
        'OGL-', 'OK', None,
764
        'OGNIE-', 'ONI', 'UNI',
765
        'OGN(AEOU)-$', 'ONI', 'UNI',
766
        'OH(AIOÖUÜY)-', 'OH', None,
767
        'OIE$', 'Ö', 'Ö',
768
        'OIRE$', 'OA', 'UA',
769
        'OIR$', 'OA', 'UA',
770
        'OIX', 'OA', 'UA',
771
        'OI<3', 'EU', 'EU',
772
        'OKAY^$', 'OKE', 'UKE',
773
        'OLYN$', 'OLIN', 'ULIN',
774
        'OO(DLMZ)-', 'U', None,
775
        'OO$', 'U', None,
776
        'OO-', '', '',
777
        'ORGINAL-----', 'ORI', 'URI',
778
        'OTI(AÄOÖUÜ)-', 'OZI', 'UZI',
779
        'OUI^', 'WI', 'FI',
780
        'OUILLE$', 'ULIE', 'ULIE',
781
        'OU(DT)-^', 'AU', 'AU',
782
        'OUSE$', 'AUS', 'AUZ',
783
        'OUT-', 'AU', 'AU',
784
        'OU', 'U', 'U',
785
        'O(FV)$', 'AU', 'AU',  # due to 'OW$' -> 'AU'
786
        'OVER--<', 'OW', None,
787
        'OV(AOU)-', 'OW', None,
788
        'OW$', 'AU', 'AU',
789
        'OWS$', 'OS', 'UZ',
790
        'OJ(AÄEIOÖUÜ)--', 'O', 'U',
791
        'OYER', 'OIA', None,
792
        'OY(AÄEIOÖUÜ)--', 'O', 'U',
793
        'O(JY)<', 'EU', 'EU',
794
        'OZ$', 'OS', None,
795
        'O´^', 'O', 'U',
796
        'O\'^', 'O', 'U',
797
        'O', None, 'U',
798
        'PATIEN--^', 'PAZI', 'PAZI',
799
        'PENSIO-^', 'PANSI', 'PANZI',
800
        'PE(LMNRST)-3^', 'PE', 'PE',
801
        'PFER-^', 'FE', 'FE',
802
        'P(FH)<', 'F', 'F',
803
        'PIC^$', 'PIK', 'PIK',
804
        'PIC$', 'PIZ', 'PIZ',
805
        'PIPELINE', 'PEIBLEIN', 'PEIBLEIN',
806
        'POLYP-', 'POLÜ', None,
807
        'POLY^^', 'POLI', 'PULI',
808
        'PORTRAIT7', 'PORTRE', 'PURTRE',
809
        'POWER7', 'PAUA', 'PAUA',
810
        'PP(FH)--<', 'B', 'B',
811
        'PP-', '', '',
812
        'PRODUZ-^', 'PRODU', 'BRUTU',
813
        'PRODUZI--', ' PRODU', ' BRUTU',
814
        'PRIX^$', 'PRI', 'PRI',
815
        'PS-^^', 'P', None,
816
        'P(SßZ)^', None, 'Z',
817
        'P(SßZ)$', 'BS', None,
818
        'PT-^', '', '',
819
        'PTI(AÄOÖUÜ)-3', 'BZI', 'BZI',
820
        'PY9^', 'PÜ', None,
821
        'P(AÄEIOÖRUÜY)-', 'P', 'P',
822
        'P(ÀÁÂÃÅÈÉÊÌÍÎÙÚÛ)-', 'P', None,
823
        'P.^', None, 'P.',
824
        'P^', 'P', None,
825
        'P', 'B', 'B',
826
        'QI-', 'Z', 'Z',
827
        'QUARANT--', 'KARA', 'KARA',
828
        'QUE(LMNRST)-3', 'KWE', 'KFE',
829
        'QUE$', 'K', 'K',
830
        'QUI(NS)$', 'KI', 'KI',
831
        'QUIZ7', 'KWIS', None,
832
        'Q(UV)7', 'KW', 'KF',
833
        'Q<', 'K', 'K',
834
        'RADFAHR----', 'RAT ', 'RAT ',
835
        'RAEFTEZEHRE-----', 'REFTE ', 'REFTE ',
836
        'RCH', 'RCH', 'RK',
837
        'REA(DU)---3^', 'R', None,
838
        'REBSERZEUG------', 'REBS ', 'REBZ ',
839
        'RECHERCH^', 'RESHASH', 'REZAZ',
840
        'RECYCL--', 'RIZEI', 'RIZEI',
841
        'RE(ALST)-3^', 'RE', None,
842
        'REE$', 'RI', 'RI',
843
        'RER$', 'RA', 'RA',
844
        'RE(MNR)-4', 'RE', 'RE',
845
        'RETTE$', 'RET', 'RET',
846
        'REUZ$', 'REUZ', None,
847
        'REW$', 'RU', 'RU',
848
        'RH<^', 'R', 'R',
849
        'RJA(MN)--', 'RI', 'RI',
850
        'ROWD-^', 'RAU', 'RAU',
851
        'RTEMONNAIE-', 'RTMON', 'RTNUN',
852
        'RTI(AÄOÖUÜ)-3', 'RZI', 'RZI',
853
        'RTIEL--3', 'RZI', 'RZI',
854
        'RV(AEOU)-3', 'RW', None,
855
        'RY(KN)-$', 'RI', 'RI',
856
        'RY9^', 'RÜ', None,
857
        'RÄFTEZEHRE-----', 'REFTE ', 'REFTE ',
858
        'SAISO-^', 'SES', 'ZEZ',
859
        'SAFE^$', 'SEIF', 'ZEIF',
860
        'SAUCE-^', 'SOS', 'ZUZ',
861
        'SCHLAGGEBEN-----<', 'SHLAK ', 'ZLAK ',
862
        'SCHSCH---7', '', '',
863
        'SCHTSCH', 'SH', 'Z',
864
        'SC(HZ)<', 'SH', 'Z',
865
        'SC', 'SK', 'ZK',
866
        'SELBSTST--7^^', 'SELB', 'ZELB',
867
        'SELBST7^^', 'SELBST', 'ZELBZT',
868
        'SERVICE7^', 'SÖRWIS', 'ZÖRFIZ',
869
        'SERVI-^', 'SERW', None,
870
        'SE(LMNRST)-3^', 'SE', 'ZE',
871
        'SETTE$', 'SET', 'ZET',
872
        'SHP-^', 'S', 'Z',
873
        'SHST', 'SHT', 'ZT',
874
        'SHTSH', 'SH', 'Z',
875
        'SHT', 'ST', 'Z',
876
        'SHY9^', 'SHÜ', None,
877
        'SH^^', 'SH', None,
878
        'SH3', 'SH', 'Z',
879
        'SICHERGEGAN-----^', 'SICHA ', 'ZIKA ',
880
        'SICHERGEHE----^', 'SICHA ', 'ZIKA ',
881
        'SICHERGESTEL------^', 'SICHA ', 'ZIKA ',
882
        'SICHERSTELL-----^', 'SICHA ', 'ZIKA ',
883
        'SICHERZU(GS)--^', 'SICHA ZU ', 'ZIKA ZU ',
884
        'SIEGLI-^', 'SIKL', 'ZIKL',
885
        'SIGLI-^', 'SIKL', 'ZIKL',
886
        'SIGHT', 'SEIT', 'ZEIT',
887
        'SIGN', 'SEIN', 'ZEIN',
888
        'SKI(NPZ)-', 'SKI', 'ZKI',
889
        'SKI<^', 'SHI', 'ZI',
890
        'SODASS^$', 'SO DAS', 'ZU TAZ',
891
        'SODAß^$', 'SO DAS', 'ZU TAZ',
892
        'SOGENAN--^', 'SO GEN', 'ZU KEN',
893
        'SOUND-', 'SAUN', 'ZAUN',
894
        'STAATS^^', 'STAZ', 'ZTAZ',
895
        'STADT^^', 'STAT', 'ZTAT',
896
        'STANDE$', ' STANDE', ' ZTANTE',
897
        'START^^', 'START', 'ZTART',
898
        'STAURANT7', 'STORAN', 'ZTURAN',
899
        'STEAK-', 'STE', 'ZTE',
900
        'STEPHEN-^$', 'STEW', None,
901
        'STERN', 'STERN', None,
902
        'STRAF^^', 'STRAF', 'ZTRAF',
903
        'ST\'S$', 'Z', 'Z',
904
        'ST´S$', 'Z', 'Z',
905
        'STST--', '', '',
906
        'STS(ACEÈÉÊHIÌÍÎOUÄÜÖ)--', 'ST', 'ZT',
907
        'ST(SZ)', 'Z', 'Z',
908
        'SPAREN---^', 'SPA', 'ZPA',
909
        'SPAREND----', ' SPA', ' ZPA',
910
        'S(PTW)-^^', 'S', None,
911
        'SP', 'SP', None,
912
        'STYN(AE)-$', 'STIN', 'ZTIN',
913
        'ST', 'ST', 'ZT',
914
        'SUITE<', 'SIUT', 'ZIUT',
915
        'SUKE--$', 'S', 'Z',
916
        'SURF(EI)-', 'SÖRF', 'ZÖRF',
917
        'SV(AEÈÉÊIÌÍÎOU)-<^', 'SW', None,
918
        'SYB(IY)--^', 'SIB', None,
919
        'SYL(KVW)--^', 'SI', None,
920
        'SY9^', 'SÜ', None,
921
        'SZE(NPT)-^', 'ZE', 'ZE',
922
        'SZI(ELN)-^', 'ZI', 'ZI',
923
        'SZCZ<', 'SH', 'Z',
924
        'SZT<', 'ST', 'ZT',
925
        'SZ<3', 'SH', 'Z',
926
        'SÜL(KVW)--^', 'SI', None,
927
        'S', None, 'Z',
928
        'TCH', 'SH', 'Z',
929
        'TD(AÄEIOÖRUÜY)-', 'T', None,
930
        'TD(ÀÁÂÃÅÈÉÊËÌÍÎÏÒÓÔÕØÙÚÛÝŸ)-', 'T', None,
931
        'TEAT-^', 'TEA', 'TEA',
932
        'TERRAI7^', 'TERA', 'TERA',
933
        'TE(LMNRST)-3^', 'TE', 'TE',
934
        'TH<', 'T', 'T',
935
        'TICHT-', 'TIK', 'TIK',
936
        'TICH$', 'TIK', 'TIK',
937
        'TIC$', 'TIZ', 'TIZ',
938
        'TIGGESTELL-------', 'TIK ', 'TIK ',
939
        'TIGSTELL-----', 'TIK ', 'TIK ',
940
        'TOAS-^', 'TO', 'TU',
941
        'TOILET-', 'TOLE', 'TULE',
942
        'TOIN-', 'TOA', 'TUA',
943
        'TRAECHTI-^', 'TRECHT', 'TREKT',
944
        'TRAECHTIG--', ' TRECHT', ' TREKT',
945
        'TRAINI-', 'TREN', 'TREN',
946
        'TRÄCHTI-^', 'TRECHT', 'TREKT',
947
        'TRÄCHTIG--', ' TRECHT', ' TREKT',
948
        'TSCH', 'SH', 'Z',
949
        'TSH', 'SH', 'Z',
950
        'TST', 'ZT', 'ZT',
951
        'T(Sß)', 'Z', 'Z',
952
        'TT(SZ)--<', '', '',
953
        'TT9', 'T', 'T',
954
        'TV^$', 'TV', 'TV',
955
        'TX(AEIOU)-3', 'SH', 'Z',
956
        'TY9^', 'TÜ', None,
957
        'TZ-', '', '',
958
        'T\'S3$', 'Z', 'Z',
959
        'T´S3$', 'Z', 'Z',
960
        'UEBEL(GNRW)-^^', 'ÜBL ', 'IBL ',
961
        'UEBER^^', 'ÜBA', 'IBA',
962
        'UE2', 'Ü', 'I',
963
        'UGL-', 'UK', None,
964
        'UH(AOÖUÜY)-', 'UH', None,
965
        'UIE$', 'Ü', 'I',
966
        'UM^^', 'UM', 'UN',
967
        'UNTERE--3', 'UNTE', 'UNTE',
968
        'UNTER^^', 'UNTA', 'UNTA',
969
        'UNVER^^', 'UNFA', 'UNFA',
970
        'UN^^', 'UN', 'UN',
971
        'UTI(AÄOÖUÜ)-', 'UZI', 'UZI',
972
        'UVE-4', 'UW', None,
973
        'UY2', 'UI', None,
974
        'UZZ', 'AS', 'AZ',
975
        'VACL-^', 'WAZ', 'FAZ',
976
        'VAC$', 'WAZ', 'FAZ',
977
        'VAN DEN ^', 'FANDN', 'FANTN',
978
        'VANES-^', 'WANE', None,
979
        'VATRO-', 'WATR', None,
980
        'VA(DHJNT)--^', 'F', None,
981
        'VEDD-^', 'FE', 'FE',
982
        'VE(BEHIU)--^', 'F', None,
983
        'VEL(BDLMNT)-^', 'FEL', None,
984
        'VENTZ-^', 'FEN', None,
985
        'VEN(NRSZ)-^', 'FEN', None,
986
        'VER(AB)-^$', 'WER', None,
987
        'VERBAL^$', 'WERBAL', None,
988
        'VERBAL(EINS)-^', 'WERBAL', None,
989
        'VERTEBR--', 'WERTE', None,
990
        'VEREIN-----', 'F', None,
991
        'VEREN(AEIOU)-^', 'WEREN', None,
992
        'VERIFI', 'WERIFI', None,
993
        'VERON(AEIOU)-^', 'WERON', None,
994
        'VERSEN^', 'FERSN', 'FAZN',
995
        'VERSIERT--^', 'WERSI', None,
996
        'VERSIO--^', 'WERS', None,
997
        'VERSUS', 'WERSUS', None,
998
        'VERTI(GK)-', 'WERTI', None,
999
        'VER^^', 'FER', 'FA',
1000
        'VERSPRECHE-------', ' FER', ' FA',
1001
        'VER$', 'WA', None,
1002
        'VER', 'FA', 'FA',
1003
        'VET(HT)-^', 'FET', 'FET',
1004
        'VETTE$', 'WET', 'FET',
1005
        'VE^', 'WE', None,
1006
        'VIC$', 'WIZ', 'FIZ',
1007
        'VIELSAGE----', 'FIL ', 'FIL ',
1008
        'VIEL', 'FIL', 'FIL',
1009
        'VIEW', 'WIU', 'FIU',
1010
        'VILL(AE)-', 'WIL', None,
1011
        'VIS(ACEIKUVWZ)-<^', 'WIS', None,
1012
        'VI(ELS)--^', 'F', None,
1013
        'VILLON--', 'WILI', 'FILI',
1014
        'VIZE^^', 'FIZE', 'FIZE',
1015
        'VLIE--^', 'FL', None,
1016
        'VL(AEIOU)--', 'W', None,
1017
        'VOKA-^', 'WOK', None,
1018
        'VOL(ATUVW)--^', 'WO', None,
1019
        'VOR^^', 'FOR', 'FUR',
1020
        'VR(AEIOU)--', 'W', None,
1021
        'VV9', 'W', None,
1022
        'VY9^', 'WÜ', 'FI',
1023
        'V(ÜY)-', 'W', None,
1024
        'V(ÀÁÂÃÅÈÉÊÌÍÎÙÚÛ)-', 'W', None,
1025
        'V(AEIJLRU)-<', 'W', None,
1026
        'V.^', 'V.', None,
1027
        'V<', 'F', 'F',
1028
        'WEITERENTWI-----^', 'WEITA ', 'FEITA ',
1029
        'WEITREICH-----^', 'WEIT ', 'FEIT ',
1030
        'WEITVER^', 'WEIT FER', 'FEIT FA',
1031
        'WE(LMNRST)-3^', 'WE', 'FE',
1032
        'WER(DST)-', 'WER', None,
1033
        'WIC$', 'WIZ', 'FIZ',
1034
        'WIEDERU--', 'WIDE', 'FITE',
1035
        'WIEDER^$', 'WIDA', 'FITA',
1036
        'WIEDER^^', 'WIDA ', 'FITA ',
1037
        'WIEVIEL', 'WI FIL', 'FI FIL',
1038
        'WISUEL', 'WISUEL', None,
1039
        'WR-^', 'W', None,
1040
        'WY9^', 'WÜ', 'FI',
1041
        'W(BDFGJKLMNPQRSTZ)-', 'F', None,
1042
        'W$', 'F', None,
1043
        'W', None, 'F',
1044
        'X<^', 'Z', 'Z',
1045
        'XHAVEN$', 'XAFN', None,
1046
        'X(CSZ)', 'X', 'X',
1047
        'XTS(CH)--', 'XT', 'XT',
1048
        'XT(SZ)', 'Z', 'Z',
1049
        'YE(LMNRST)-3^', 'IE', 'IE',
1050
        'YE-3', 'I', 'I',
1051
        'YOR(GK)^$', 'IÖRK', 'IÖRK',
1052
        'Y(AOU)-<7', 'I', 'I',
1053
        'Y(BKLMNPRSTX)-1', 'Ü', None,
1054
        'YVES^$', 'IF', 'IF',
1055
        'YVONNE^$', 'IWON', 'IFUN',
1056
        'Y.^', 'Y.', None,
1057
        'Y', 'I', 'I',
1058
        'ZC(AOU)-', 'SK', 'ZK',
1059
        'ZE(LMNRST)-3^', 'ZE', 'ZE',
1060
        'ZIEJ$', 'ZI', 'ZI',
1061
        'ZIGERJA(HR)-3', 'ZIGA IA', 'ZIKA IA',
1062
        'ZL(AEIOU)-', 'SL', None,
1063
        'ZS(CHT)--', '', '',
1064
        'ZS', 'SH', 'Z',
1065
        'ZUERST', 'ZUERST', 'ZUERST',
1066
        'ZUGRUNDE^$', 'ZU GRUNDE', 'ZU KRUNTE',
1067
        'ZUGRUNDE', 'ZU GRUNDE ', 'ZU KRUNTE ',
1068
        'ZUGUNSTEN', 'ZU GUNSTN', 'ZU KUNZTN',
1069
        'ZUHAUSE-', 'ZU HAUS', 'ZU AUZ',
1070
        'ZULASTEN^$', 'ZU LASTN', 'ZU LAZTN',
1071
        'ZURUECK^^', 'ZURÜK', 'ZURIK',
1072
        'ZURZEIT', 'ZUR ZEIT', 'ZUR ZEIT',
1073
        'ZURÜCK^^', 'ZURÜK', 'ZURIK',
1074
        'ZUSTANDE', 'ZU STANDE', 'ZU ZTANTE',
1075
        'ZUTAGE', 'ZU TAGE', 'ZU TAKE',
1076
        'ZUVER^^', 'ZUFA', 'ZUFA',
1077
        'ZUVIEL', 'ZU FIL', 'ZU FIL',
1078
        'ZUWENIG', 'ZU WENIK', 'ZU FENIK',
1079
        'ZY9^', 'ZÜ', None,
1080
        'ZYK3$', 'ZIK', None,
1081
        'Z(VW)7^', 'SW', None,
1082
        None, None, None)
1083
1084
    phonet_hash = Counter()
1085
    alpha_pos = Counter()
1086
1087
    phonet_hash_1 = Counter()
1088
    phonet_hash_2 = Counter()
1089
1090
    _phonet_upper_translation = dict(zip((ord(_) for _ in
0 ignored issues
show
Comprehensibility Best Practice introduced by
The variable _ does not seem to be defined.
Loading history...
1091
                                          'abcdefghijklmnopqrstuvwxyzàáâãåäæ' +
1092
                                          'çðèéêëìíîïñòóôõöøœšßþùúûüýÿ'),
1093
                                         'ABCDEFGHIJKLMNOPQRSTUVWXYZÀÁÂÃÅÄÆ' +
1094
                                         'ÇÐÈÉÊËÌÍÎÏÑÒÓÔÕÖØŒŠßÞÙÚÛÜÝŸ'))
1095
1096
    def _initialize_phonet(lang):
        """Build the rule lookup tables for the selected rule set.

        The rule table is a flat sequence read as consecutive triples
        ``(pattern, replacement, replacement)``. This fills, in place, the
        tables defined in the enclosing scope:

        - ``alpha_pos``: 1 for the listed accented letters, 2..27 for
          'A'..'Z' (any other key reads as 0)
        - ``phonet_hash``: index of the first usable rule whose pattern
          starts with a given letter
        - ``phonet_hash_1`` / ``phonet_hash_2``: first and last rule index
          keyed by (first letter, position of the second pattern letter),
          with column 0 used for "any second letter"

        :param str lang: ``'none'`` selects the language-neutral rules; any
            other value selects the German rules
        """
        if lang == 'none':
            _phonet_rules = _phonet_rules_no_lang
        else:
            _phonet_rules = _phonet_rules_german

        phonet_hash[''] = -1

        # German and international umlauts
        for j in {'À', 'Á', 'Â', 'Ã', 'Ä', 'Å', 'Æ', 'Ç', 'È', 'É', 'Ê', 'Ë',
                  'Ì', 'Í', 'Î', 'Ï', 'Ð', 'Ñ', 'Ò', 'Ó', 'Ô', 'Õ', 'Ö', 'Ø',
                  'Ù', 'Ú', 'Û', 'Ü', 'Ý', 'Þ', 'ß', 'Œ', 'Š', 'Ÿ'}:
            alpha_pos[j] = 1
            phonet_hash[j] = -1

        # "normal" letters ('A'-'Z')
        for i, j in enumerate('ABCDEFGHIJKLMNOPQRSTUVWXYZ'):
            alpha_pos[j] = i + 2
            phonet_hash[j] = -1

        # mark every (first letter, second letter) slot as "no rule yet"
        for i in range(26):
            for j in range(28):
                phonet_hash_1[i, j] = -1
                phonet_hash_2[i, j] = -1

        # for each phonetic rule; patterns sit at every third index
        for i in range(0, len(_phonet_rules), 3):
            rule = _phonet_rules[i]

            if not rule:
                # placeholder / terminator entry
                continue

            # calculate first hash value
            k = rule[0]

            # Only None means "no replacement"; an empty string is a real
            # replacement (delete the match), consistent with the
            # "is None" tests used when the rules are applied.
            if phonet_hash[k] < 0 and (_phonet_rules[i+1] is not None or
                                       _phonet_rules[i+2] is not None):
                phonet_hash[k] = i

            # calculate second hash values ('A'-'Z' patterns only)
            if alpha_pos[k] < 2:
                continue

            j = alpha_pos[k] - 2
            rule = rule[1:]

            # Reduce the pattern tail to the candidates for the second
            # letter: a blank for one-letter patterns, the contents of a
            # leading "(...)" group, or the single next character.
            if not rule:
                rule = ' '
            elif rule[0] == '(':
                rule = rule[1:]
            else:
                rule = rule[0]

            for letter in rule:
                if letter == ')':
                    break

                k = alpha_pos[letter]

                if k > 0:
                    # add hash value for this letter
                    if phonet_hash_1[j, k] < 0:
                        phonet_hash_1[j, k] = i
                        phonet_hash_2[j, k] = i

                    # extend the range only while rules stay within 30
                    # table entries of the previous end; otherwise fall
                    # back to the "all letters" slot below
                    if phonet_hash_2[j, k] >= (i-30):
                        phonet_hash_2[j, k] = i
                    else:
                        k = -1

                if k <= 0:
                    # add hash value for all letters
                    if phonet_hash_1[j, 0] < 0:
                        phonet_hash_1[j, 0] = i

                    phonet_hash_2[j, 0] = i
1170
1171
    def _phonet(term, mode, lang):
        """Return the phonet coded form of a term.

        The rule table is read as consecutive triples
        ``(pattern, replacement, replacement)``; ``mode`` is the offset from
        a pattern's index to the replacement that is applied.

        Pattern control characters, as handled below:

        - ``(...)``: the next source character must be a letter found in
          the group
        - ``-``: each one removes a trailing character from the consumed
          match length
        - ``<``: the replacement is written back into the source and
          scanned again instead of being copied to the output
        - a digit: the rule's priority (default 5)
        - ``^``: the match must start at the beginning of a word; ``^^``
          additionally restarts the scan on the rest of the source
        - ``$``: the match must end at the end of a word

        :param str term: the term to encode
        :param int mode: offset selecting the replacement column (expected
            to be 1 or 2 given the three-column rule table)
        :param str lang: ``'none'`` selects the language-neutral rules; any
            other value selects the German rules
        :returns: the encoded term (``''`` for an empty term)
        :rtype: str
        """
        if lang == 'none':
            _phonet_rules = _phonet_rules_no_lang
        else:
            _phonet_rules = _phonet_rules_german

        char0 = ''
        dest = term

        if not term:
            return ''

        term_length = len(term)

        # convert input string to upper-case
        src = term.translate(_phonet_upper_translation)

        # check "src"
        i = 0  # read position in src
        j = 0  # write position in dest
        zeta = 0  # non-zero right after a '<' rule rewrote src

        while i < len(src):
            char = src[i]

            pos = alpha_pos[char]

            if pos >= 2:
                # 'A'-'Z': look up the rule ranges by (letter, next letter)
                xpos = pos-2

                if i+1 == len(src):
                    pos = alpha_pos['']
                else:
                    pos = alpha_pos[src[i+1]]

                start1 = phonet_hash_1[xpos, pos]
                start2 = phonet_hash_1[xpos, 0]
                end1 = phonet_hash_2[xpos, pos]
                end2 = phonet_hash_2[xpos, 0]

                # preserve rule priorities
                if (start2 >= 0) and ((start1 < 0) or (start2 < start1)):
                    pos = start1
                    start1 = start2
                    start2 = pos
                    pos = end1
                    end1 = end2
                    end2 = pos

                # merge the two ranges when they overlap
                if (end1 >= start2) and (start2 >= 0):
                    if end2 > end1:
                        end1 = end2

                    start2 = -1
                    end2 = -1
            else:
                # any other character: single range from the first hash
                pos = phonet_hash[char]
                start1 = pos
                end1 = 10000
                start2 = -1
                end2 = -1

            pos = start1
            zeta0 = 0  # set when this pass must not advance i

            if pos >= 0:
                # check rules for this char
                while ((_phonet_rules[pos] is None) or
                       (_phonet_rules[pos][0] == char)):
                    if pos > end1:
                        if start2 > 0:
                            pos = start2
                            start1 = start2
                            start2 = -1
                            end1 = end2
                            end2 = -1
                            continue

                        break

                    if (((_phonet_rules[pos] is None) or
                         (_phonet_rules[pos + mode] is None))):
                        # no conversion rule available
                        pos += 3
                        continue

                    # check whole string
                    matches = 1  # number of matching letters
                    priority = 5  # default priority
                    rule = _phonet_rules[pos]
                    rule = rule[1:]

                    # Match literal pattern characters. The test is on the
                    # current rule character, so a control character in
                    # the pattern is never matched against the same
                    # character occurring in the input.
                    while (rule and
                           (len(src) > (i + matches)) and
                           (src[i + matches] == rule[0]) and
                           not rule[0].isdigit() and
                           (rule[0] not in '(-<^$')):
                        matches += 1
                        rule = rule[1:]

                    if rule and (rule[0] == '('):
                        # check an array of letters
                        if (((len(src) > (i + matches)) and
                             src[i + matches].isalpha() and
                             (src[i + matches] in rule[1:]))):
                            matches += 1

                            while rule and rule[0] != ')':
                                rule = rule[1:]

                            # if rule[0] == ')':
                            rule = rule[1:]

                    # remember the first unconsumed pattern character; it
                    # is compared with '-' further down
                    if rule:
                        priority0 = ord(rule[0])
                    else:
                        priority0 = 0

                    matches0 = matches

                    while rule and rule[0] == '-' and matches > 1:
                        matches -= 1
                        rule = rule[1:]

                    if rule and rule[0] == '<':
                        rule = rule[1:]

                    if rule and rule[0].isdigit():
                        # read priority
                        priority = int(rule[0])
                        rule = rule[1:]

                    if rule and rule[0:2] == '^^':
                        rule = rule[1:]

                    # The rule applies if the pattern is exhausted, or its
                    # '^' (word start) / '$' (word end) anchors hold.
                    if (not rule or
                            ((rule[0] == '^') and
                             ((i == 0) or not src[i-1].isalpha()) and
                             ((rule[1:2] != '$') or
                              (not (src[i+matches0:i+matches0+1].isalpha()) and
                               (src[i+matches0:i+matches0+1] != '.')))) or
                            ((rule[0] == '$') and (i > 0) and
                             src[i-1].isalpha() and
                             ((not src[i+matches0:i+matches0+1].isalpha()) and
                              (src[i+matches0:i+matches0+1] != '.')))):
                        # look for continuation, if:
                        # matches > 1 and NO '-' in first string
                        pos0 = -1

                        start3 = 0
                        start4 = 0
                        end3 = 0
                        end4 = 0

                        if (((matches > 1) and
                             src[i+matches:i+matches+1] and
                             (priority0 != ord('-')))):
                            char0 = src[i+matches-1]
                            pos0 = alpha_pos[char0]

                            if pos0 >= 2 and src[i+matches]:
                                xpos = pos0 - 2
                                pos0 = alpha_pos[src[i+matches]]
                                start3 = phonet_hash_1[xpos, pos0]
                                start4 = phonet_hash_1[xpos, 0]
                                end3 = phonet_hash_2[xpos, pos0]
                                end4 = phonet_hash_2[xpos, 0]

                                # preserve rule priorities
                                if (((start4 >= 0) and
                                     ((start3 < 0) or (start4 < start3)))):
                                    pos0 = start3
                                    start3 = start4
                                    start4 = pos0
                                    pos0 = end3
                                    end3 = end4
                                    end4 = pos0

                                if (end3 >= start4) and (start4 >= 0):
                                    if end4 > end3:
                                        end3 = end4

                                    start4 = -1
                                    end4 = -1
                            else:
                                pos0 = phonet_hash[char0]
                                start3 = pos0
                                end3 = 10000
                                start4 = -1
                                end4 = -1

                            pos0 = start3

                        # check continuation rules for src[i+matches]
                        if pos0 >= 0:
                            while ((_phonet_rules[pos0] is None) or
                                   (_phonet_rules[pos0][0] == char0)):
                                if pos0 > end3:
                                    if start4 > 0:
                                        pos0 = start4
                                        start3 = start4
                                        start4 = -1
                                        end3 = end4
                                        end4 = -1
                                        continue

                                    priority0 = -1

                                    # important
                                    break

                                if (((_phonet_rules[pos0] is None) or
                                     (_phonet_rules[pos0 + mode]
                                      is None))):
                                    # no conversion rule available
                                    pos0 += 3
                                    continue

                                # check whole string
                                matches0 = matches
                                priority0 = 5
                                rule = _phonet_rules[pos0]
                                rule = rule[1:]

                                # literal characters only, tested on the
                                # current rule character (see above)
                                while (rule and
                                       (src[i+matches0:i+matches0+1] ==
                                        rule[0]) and
                                       not rule[0].isdigit() and
                                       (rule[0] not in '(-<^$')):
                                    matches0 += 1
                                    rule = rule[1:]

                                if rule and rule[0] == '(':
                                    # check an array of letters
                                    if ((src[i+matches0:i+matches0+1]
                                         .isalpha() and
                                         (src[i+matches0] in rule[1:]))):
                                        matches0 += 1

                                        while rule and rule[0] != ')':
                                            rule = rule[1:]

                                        # if rule[0] == ')':
                                        rule = rule[1:]

                                while rule and rule[0] == '-':
                                    # "matches0" is NOT decremented
                                    # because of  "if (matches0 == matches)"
                                    rule = rule[1:]

                                if rule and rule[0] == '<':
                                    rule = rule[1:]

                                if rule and rule[0].isdigit():
                                    priority0 = int(rule[0])
                                    rule = rule[1:]

                                if (not rule or
                                        # rule == '^' is not possible here
                                        ((rule[0] == '$') and not
                                         src[i+matches0:i+matches0+1]
                                         .isalpha() and
                                         (src[i+matches0:i+matches0+1]
                                          != '.'))):
                                    if matches0 == matches:
                                        # this is only a partial string
                                        pos0 += 3
                                        continue

                                    if priority0 < priority:
                                        # priority is too low
                                        pos0 += 3
                                        continue

                                    # continuation rule found
                                    break

                                pos0 += 3

                            # end of "while"
                            # a continuation rule of sufficient priority
                            # was found: skip the current rule
                            if ((priority0 >= priority) and
                                    ((_phonet_rules[pos0] is not None) and
                                     (_phonet_rules[pos0][0] == char0))):

                                pos += 3
                                continue

                        # replace string
                        if ((_phonet_rules[pos] and
                             ('<' in _phonet_rules[pos][1:]))):
                            priority0 = 1
                        else:
                            priority0 = 0

                        rule = _phonet_rules[pos + mode]

                        if (priority0 == 1) and (zeta == 0):
                            # rule with '<' is applied
                            if ((j > 0) and rule and
                                    ((dest[j-1] == char) or
                                     (dest[j-1] == rule[0]))):
                                j -= 1

                            zeta0 = 1
                            zeta += 1
                            matches0 = 0

                            # overwrite src in place with the replacement,
                            # stopping at the end of src
                            while rule and (i + matches0) < len(src):
                                src = (src[0:i+matches0] + rule[0] +
                                       src[i+matches0+1:])
                                matches0 += 1
                                rule = rule[1:]

                            # drop matched characters the replacement did
                            # not cover
                            if matches0 < matches:
                                src = (src[0:i+matches0] +
                                       src[i+matches:])

                            char = src[i]
                        else:
                            i = i + matches - 1
                            zeta = 0

                            # copy all but the last replacement character,
                            # skipping immediate repeats
                            while len(rule) > 1:
                                if (j == 0) or (dest[j - 1] != rule[0]):
                                    dest = (dest[0:j] + rule[0] +
                                            dest[min(len(dest), j+1):])
                                    j += 1

                                rule = rule[1:]

                            # new "current char"
                            if not rule:
                                rule = ''
                                char = ''
                            else:
                                char = rule[0]

                            if ((_phonet_rules[pos] and
                                 '^^' in _phonet_rules[pos][1:])):
                                if char:
                                    dest = (dest[0:j] + char +
                                            dest[min(len(dest), j + 1):])
                                    j += 1

                                # restart the scan on the rest of src
                                src = src[i + 1:]
                                i = 0
                                zeta0 = 1

                        break

                    pos += 3

                    if pos > end1 and start2 > 0:
                        pos = start2
                        start1 = start2
                        end1 = end2
                        start2 = -1
                        end2 = -1

            if zeta0 == 0:
                if char and ((j == 0) or (dest[j-1] != char)):
                    # delete multiple letters only
                    dest = dest[0:j] + char + dest[min(j+1, term_length):]
                    j += 1

                i += 1
                zeta = 0

        # only the first j characters of dest are output
        dest = dest[0:j]

        return dest
1543
1544
    _initialize_phonet(lang)
1545
1546
    word = unicode_normalize('NFKC', text_type(word))
1547
    return _phonet(word, mode, lang)
1548
1549
1550
if __name__ == '__main__':
    # Executed as a script: run the doctests embedded in this module.
    from doctest import testmod
    testmod()
1553