Completed
Push — master ( f43547...71985b )
by Chris
12:00 queued 10s
created

abydos.phonetic._beider_morse_data   B

Complexity

Total Complexity 0

Size/Duplication

Total Lines 6226
Duplicated Lines 0 %

Test Coverage

Coverage 100%

Importance

Changes 0
Metric Value
eloc 5347
dl 0
loc 6226
ccs 264
cts 264
cp 1
rs 8.8
c 0
b 0
f 0
wmc 0
1
# -*- coding: utf-8 -*-
0 ignored issues
show
coding-style introduced by
Too many lines in module (6225/1000)
Loading history...
2
3
# Copyright 2014-2018 by Christopher C. Little.
4
# This file is part of Abydos.
5
#
6
# This file is based on Alexander Beider and Stephen P. Morse's implementation
7
# of the Beider-Morse Phonetic Matching (BMPM) System, available at
8
# http://stevemorse.org/phonetics/bmpm.htm.
9
#
10
# Abydos is free software: you can redistribute it and/or modify
11
# it under the terms of the GNU General Public License as published by
12
# the Free Software Foundation, either version 3 of the License, or
13
# (at your option) any later version.
14
#
15
# Abydos is distributed in the hope that it will be useful,
16
# but WITHOUT ANY WARRANTY; without even the implied warranty of
17
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18
# GNU General Public License for more details.
19
#
20
# You should have received a copy of the GNU General Public License
21
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.
22
23 1
"""abydos.phonetic._beider_morse_data.
24
25
Behind-the-scenes constants, rules, etc. for the Beider-Morse Phonetic
26
Matching (BMPM) algorithm
27
28
DO NOT EDIT - This document is automatically generated from the reference
29
implementation in PHP.
30
"""
31
32 1
from __future__ import (
33
    absolute_import,
34
    division,
35
    print_function,
36
    unicode_literals,
37
)
38
39 1
# Language bit flags.  L_NONE is zero and every other language is a distinct
# power of two, so a set of languages can be stored as the sum (bitwise OR)
# of its members -- e.g. 65600 == L_ROMANIAN + L_FRENCH.
L_NONE = 0
L_ANY = 2 ** 0
L_ARABIC = 2 ** 1
L_CYRILLIC = 2 ** 2
L_CZECH = 2 ** 3
L_DUTCH = 2 ** 4
L_ENGLISH = 2 ** 5
L_FRENCH = 2 ** 6
L_GERMAN = 2 ** 7
L_GREEK = 2 ** 8
L_GREEKLATIN = 2 ** 9
L_HEBREW = 2 ** 10
L_HUNGARIAN = 2 ** 11
L_ITALIAN = 2 ** 12
L_LATVIAN = 2 ** 13
L_POLISH = 2 ** 14
L_PORTUGUESE = 2 ** 15
L_ROMANIAN = 2 ** 16
L_RUSSIAN = 2 ** 17
L_SPANISH = 2 ** 18
L_TURKISH = 2 ** 19
60
61
# gen/approxany.php
62
63
# GENERIC
64
# A, E, I, O, P, U should create variants, but a, e, i, o, u should not create any new variant  # noqa: E501
0 ignored issues
show
Coding Style introduced by
This line is too long as per the coding-style (108/100).

This check looks for lines that are too long. You can specify the maximum line length.

Loading history...
65
# Q = ü ; Y = ä = ö
66
# EE = final "e" (english or french)
67
68 1
_GEN_APPROX_ANY = (
69
    # VOWELS
70
    # "ALL" DIPHTHONGS are interchangeable BETWEEN THEM and with monophthongs of which they are composed ("D" means "diphthong")  # noqa: E501
0 ignored issues
show
Coding Style introduced by
This line is too long as per the coding-style (142/100).

This check looks for lines that are too long. You can specify the maximum line length.

Loading history...
71
    # {a,o} are totally interchangeable if non-stressed; in German "a/o" can actually be from "ä/ö" (that are equivalent to "e")  # noqa: E501
0 ignored issues
show
Coding Style introduced by
This line is too long as per the coding-style (142/100).

This check looks for lines that are too long. You can specify the maximum line length.

Loading history...
72
    # {i,e} are interchangeable if non-stressed, while in German "u" can actually be from "ü" (that is equivalent to "i")  # noqa: E501
0 ignored issues
show
Coding Style introduced by
This line is too long as per the coding-style (135/100).

This check looks for lines that are too long. You can specify the maximum line length.

Loading history...
73
    ('mb', '', '', '(mb|b[512])'),
74
    ('mp', '', '', '(mp|b[512])'),
75
    ('ng', '', '', '(ng|g[512])'),
76
    ('B', '', '[fktSs]', '(p|f[262144])'),
77
    ('B', '', 'p', ''),
78
    ('B', '', '$', '(p|f[262144])'),
79
    ('V', '', '[pktSs]', '(f|p[262144])'),
80
    ('V', '', 'f', ''),
81
    ('V', '', '$', '(f|p[262144])'),
82
    ('B', '', '', '(b|v[262144])'),
83
    ('V', '', '', '(v|b[262144])'),
84
    # French word-final and word-part-final letters
85
    ('t', '', '$', '(t|[64])'),
86
    ('g', 'n', '$', '(g|[64])'),
87
    ('k', 'n', '$', '(k|[64])'),
88
    ('p', '', '$', '(p|[64])'),
89
    ('r', '[Ee]', '$', '(r|[64])'),
90
    ('s', '', '$', '(s|[64])'),
91
    ('t', '[aeiouAEIOU]', '[^aeiouAEIOU]', '(t|[64])'),  # Petitjean
92
    ('s', '[aeiouAEIOU]', '[^aeiouAEIOU]', '(s|[64])'),  # Groslot, Grosleau
93
    # ("p","[aeiouAEIOU]","[^aeiouAEIOU]","(p|[64])"),
94
    ('I', '[aeiouAEIBFOUQY]', '', 'i'),
95
    ('I', '', '[^aeiouAEBFIOU]e', '(Q[128]|i|D[32])'),  # "line"
96
    ('I', '', '$', 'i'),
97
    ('I', '', '[^k]$', 'i'),
98
    ('Ik', '[lr]', '$', '(ik|Qk[128])'),
99
    ('Ik', '', '$', 'ik'),
100
    ('sIts', '', '$', '(sits|sQts[128])'),
101
    ('Its', '', '$', 'its'),
102
    ('I', '', '', '(Q[128]|i)'),
103
    ('lEE', '[bdfgkmnprsStvzZ]', '', '(li|il[32])'),  # Apple = Appel
104
    ('rEE', '[bdfgkmnprsStvzZ]', '', '(ri|ir[32])'),
105
    (
106
        'lE',
107
        '[bdfgkmnprsStvzZ]',
108
        '',
109
        '(li|il[32]|lY[128])',
110
    ),  # Applebaum < Appelbaum  # noqa: E501
111
    ('rE', '[bdfgkmnprsStvzZ]', '', '(ri|ir[32]|rY[128])'),
112
    ('EE', '', '', '(i|)'),
113
    ('ea', '', '', '(D|a|i)'),
114
    ('au', '', '', '(D|a|u)'),
115
    ('ou', '', '', '(D|o|u)'),
116
    ('eu', '', '', '(D|e|u)'),
117
    ('ai', '', '', '(D|a|i)'),
118
    ('Ai', '', '', '(D|a|i)'),
119
    ('oi', '', '', '(D|o|i)'),
120
    ('Oi', '', '', '(D|o|i)'),
121
    ('ui', '', '', '(D|u|i)'),
122
    ('Ui', '', '', '(D|u|i)'),
123
    ('ei', '', '', '(D|i)'),
124
    ('Ei', '', '', '(D|i)'),
125
    ('iA', '', '$', '(ia|io)'),
126
    ('iA', '', '', '(ia|io|iY[128])'),
127
    ('A', '', '[^aeiouAEBFIOU]e', '(a|o|Y[128]|D[32])'),  # "plane"
128
    (
129
        'E',
130
        'i[^aeiouAEIOU]',
131
        '',
132
        '(i|Y[128]|[32])',
133
    ),  # Wineberg (vineberg/vajneberg) --> vajnberg  # noqa: E501
134
    (
135
        'E',
136
        'a[^aeiouAEIOU]',
137
        '',
138
        '(i|Y[128]|[32])',
139
    ),  # Shaneberg (shaneberg/shejneberg) --> shejnberg  # noqa: E501
140
    ('E', '', '[fklmnprst]$', 'i'),
141
    ('E', '', 'ts$', 'i'),
142
    ('E', '', '$', 'i'),
143
    ('E', '[DaoiuAOIUQY]', '', 'i'),
144
    ('E', '', '[aoAOQY]', 'i'),
145
    ('E', '', '', '(i|Y[128])'),
146
    ('P', '', '', '(o|u)'),
147
    ('O', '', '[fklmnprstv]$', 'o'),
148
    ('O', '', 'ts$', 'o'),
149
    ('O', '', '$', 'o'),
150
    ('O', '[oeiuQY]', '', 'o'),
151
    ('O', '', '', '(o|Y[128])'),
152
    ('O', '', '', 'o'),
153
    ('A', '', '[fklmnprst]$', '(a|o)'),
154
    ('A', '', 'ts$', '(a|o)'),
155
    ('A', '', '$', '(a|o)'),
156
    ('A', '[oeiuQY]', '', '(a|o)'),
157
    ('A', '', '', '(a|o|Y[128])'),
158
    ('A', '', '', '(a|o)'),
159
    ('U', '', '$', 'u'),
160
    ('U', '[DoiuQY]', '', 'u'),
161
    ('U', '', '[^k]$', 'u'),
162
    ('Uk', '[lr]', '$', '(uk|Qk[128])'),
163
    ('Uk', '', '$', 'uk'),
164
    ('sUts', '', '$', '(suts|sQts[128])'),
165
    ('Uts', '', '$', 'uts'),
166
    ('U', '', '', '(u|Q[128])'),
167
    ('U', '', '', 'u'),
168
    ('e', '', '[fklmnprstv]$', 'i'),
169
    ('e', '', 'ts$', 'i'),
170
    ('e', '', '$', 'i'),
171
    ('e', '[DaoiuAOIUQY]', '', 'i'),
172
    ('e', '', '[aoAOQY]', 'i'),
173
    ('e', '', '', '(i|Y[128])'),
174
    ('a', '', '', '(a|o)'),
175
)
176
177
# gen/approxarabic.php
178 1
_GEN_APPROX_ARABIC = (
179
    ('1a', '', '', '(D|a)'),
180
    ('1i', '', '', '(D|i|e)'),
181
    ('1u', '', '', '(D|u|o)'),
182
    ('j1', '', '', '(ja|je|jo|ju|j)'),
183
    ('1', '', '', '(a|e|i|o|u|)'),
184
    ('u', '', '', '(o|u)'),
185
    ('i', '', '', '(i|e)'),
186
    ('p', '', '$', 'p'),
187
    ('p', '', '', '(p|b)'),
188
)
189
190
# gen/approxcommon.php
191
192
# GENERIC
193
194 1
_GEN_APPROX_COMMON = (
195
    # DUTCH
196
    ('van', '^', '[bp]', '(vam|)'),
197
    ('van', '^', '', '(van|)'),
198
    # REGRESSIVE ASSIMILATION OF CONSONANTS
199
    ('n', '', '[bp]', 'm'),
200
    # PECULIARITY OF "h"
201
    ('h', '', '', ''),
202
    ('H', '', '', '(x|)'),
203
    # "e" and "i" ARE TO BE OMITTED BEFORE (SYLLABIC) n & l: Halperin=Halpern; Frankel = Frankl, Finkelstein = Finklstein  # noqa: E501
0 ignored issues
show
Coding Style introduced by
This line is too long as per the coding-style (135/100).

This check looks for lines that are too long. You can specify the maximum line length.

Loading history...
204
    # but Andersen & Anderson should match
205
    ('sen', '[rmnl]', '$', '(zn|zon)'),
206
    ('sen', '', '$', '(sn|son)'),
207
    ('sEn', '[rmnl]', '$', '(zn|zon)'),
208
    ('sEn', '', '$', '(sn|son)'),
209
    ('e', '[BbdfgklmnprsStvzZ]', '[ln]$', ''),
210
    ('i', '[BbdfgklmnprsStvzZ]', '[ln]$', ''),
211
    ('E', '[BbdfgklmnprsStvzZ]', '[ln]$', ''),
212
    ('I', '[BbdfgklmnprsStvzZ]', '[ln]$', ''),
213
    ('Q', '[BbdfgklmnprsStvzZ]', '[ln]$', ''),
214
    ('Y', '[BbdfgklmnprsStvzZ]', '[ln]$', ''),
215
    ('e', '[bdfgklmnprsStvzZ]', '[ln][bdfgklmnprsStvzZ]', '(e|)'),
216
    ('i', '[bdfgklmnprsStvzZ]', '[ln][bdfgklmnprsStvzZ]', '(i|)'),
217
    ('E', '[bdfgklmnprsStvzZ]', '[ln][bdfgklmnprsStvzZ]', '(E|)'),
218
    ('I', '[bdfgklmnprsStvzZ]', '[ln][bdfgklmnprsStvzZ]', '(I|)'),
219
    ('Q', '[bdfgklmnprsStvzZ]', '[ln][bdfgklmnprsStvzZ]', '(Q|)'),
220
    ('Y', '[bdfgklmnprsStvzZ]', '[ln][bdfgklmnprsStvzZ]', '(Y|)'),
221
    # ("e", "[BbdfgklmnprsStvzZ]", "[ln][BbdfgklmnprsStvzZ]", ""),
222
    # ("i", "[BbdfgklmnprsStvzZ]", "[ln][BbdfgklmnprsStvzZ]", ""),
223
    # ("E", "[BbdfgklmnprsStvzZ]", "[ln][BbdfgklmnprsStvzZ]", ""),
224
    # ("I", "[BbdfgklmnprsStvzZ]", "[ln][BbdfgklmnprsStvzZ]", ""),
225
    # ("Q", "[BbdfgklmnprsStvzZ]", "[ln][BbdfgklmnprsStvzZ]", ""),
226
    # ("Y", "[BbdfgklmnprsStvzZ]", "[ln][BbdfgklmnprsStvzZ]", ""),
227
    (
228
        'lEs',
229
        '',
230
        '',
231
        '(lEs|lz)',
232
    ),  # Applebaum < Appelbaum (English + blend English-something forms as Finklestein)  # noqa: E501
0 ignored issues
show
Coding Style introduced by
This line is too long as per the coding-style (102/100).

This check looks for lines that are too long. You can specify the maximum line length.

Loading history...
233
    (
234
        'lE',
235
        '[bdfgkmnprStvzZ]',
236
        '',
237
        '(lE|l)',
238
    ),  # Applebaum < Appelbaum (English + blend English-something forms as Finklestein)  # noqa: E501
0 ignored issues
show
Coding Style introduced by
This line is too long as per the coding-style (102/100).

This check looks for lines that are too long. You can specify the maximum line length.

Loading history...
239
    # SIMPLIFICATION: (TRIPHTHONGS & DIPHTHONGS) -> ONE GENERIC DIPHTHONG "D"
240
    ('aue', '', '', 'D'),
241
    ('oue', '', '', 'D'),
242
    ('AvE', '', '', '(D|AvE)'),
243
    ('Ave', '', '', '(D|Ave)'),
244
    ('avE', '', '', '(D|avE)'),
245
    ('ave', '', '', '(D|ave)'),
246
    ('OvE', '', '', '(D|OvE)'),
247
    ('Ove', '', '', '(D|Ove)'),
248
    ('ovE', '', '', '(D|ovE)'),
249
    ('ove', '', '', '(D|ove)'),
250
    ('ea', '', '', '(D|ea)'),
251
    ('EA', '', '', '(D|EA)'),
252
    ('Ea', '', '', '(D|Ea)'),
253
    ('eA', '', '', '(D|eA)'),
254
    ('aji', '', '', 'D'),
255
    ('ajI', '', '', 'D'),
256
    ('aje', '', '', 'D'),
257
    ('ajE', '', '', 'D'),
258
    ('Aji', '', '', 'D'),
259
    ('AjI', '', '', 'D'),
260
    ('Aje', '', '', 'D'),
261
    ('AjE', '', '', 'D'),
262
    ('oji', '', '', 'D'),
263
    ('ojI', '', '', 'D'),
264
    ('oje', '', '', 'D'),
265
    ('ojE', '', '', 'D'),
266
    ('Oji', '', '', 'D'),
267
    ('OjI', '', '', 'D'),
268
    ('Oje', '', '', 'D'),
269
    ('OjE', '', '', 'D'),
270
    ('eji', '', '', 'D'),
271
    ('ejI', '', '', 'D'),
272
    ('eje', '', '', 'D'),
273
    ('ejE', '', '', 'D'),
274
    ('Eji', '', '', 'D'),
275
    ('EjI', '', '', 'D'),
276
    ('Eje', '', '', 'D'),
277
    ('EjE', '', '', 'D'),
278
    ('uji', '', '', 'D'),
279
    ('ujI', '', '', 'D'),
280
    ('uje', '', '', 'D'),
281
    ('ujE', '', '', 'D'),
282
    ('Uji', '', '', 'D'),
283
    ('UjI', '', '', 'D'),
284
    ('Uje', '', '', 'D'),
285
    ('UjE', '', '', 'D'),
286
    ('iji', '', '', 'D'),
287
    ('ijI', '', '', 'D'),
288
    ('ije', '', '', 'D'),
289
    ('ijE', '', '', 'D'),
290
    ('Iji', '', '', 'D'),
291
    ('IjI', '', '', 'D'),
292
    ('Ije', '', '', 'D'),
293
    ('IjE', '', '', 'D'),
294
    ('aja', '', '', 'D'),
295
    ('ajA', '', '', 'D'),
296
    ('ajo', '', '', 'D'),
297
    ('ajO', '', '', 'D'),
298
    ('aju', '', '', 'D'),
299
    ('ajU', '', '', 'D'),
300
    ('Aja', '', '', 'D'),
301
    ('AjA', '', '', 'D'),
302
    ('Ajo', '', '', 'D'),
303
    ('AjO', '', '', 'D'),
304
    ('Aju', '', '', 'D'),
305
    ('AjU', '', '', 'D'),
306
    ('oja', '', '', 'D'),
307
    ('ojA', '', '', 'D'),
308
    ('ojo', '', '', 'D'),
309
    ('ojO', '', '', 'D'),
310
    ('Aju', '', '', 'D'),
311
    ('AjU', '', '', 'D'),
312
    ('Oja', '', '', 'D'),
313
    ('OjA', '', '', 'D'),
314
    ('Ojo', '', '', 'D'),
315
    ('OjO', '', '', 'D'),
316
    ('Aju', '', '', 'D'),
317
    ('AjU', '', '', 'D'),
318
    ('eja', '', '', 'D'),
319
    ('ejA', '', '', 'D'),
320
    ('ejo', '', '', 'D'),
321
    ('ejO', '', '', 'D'),
322
    ('Aju', '', '', 'D'),
323
    ('AjU', '', '', 'D'),
324
    ('Eja', '', '', 'D'),
325
    ('EjA', '', '', 'D'),
326
    ('Ejo', '', '', 'D'),
327
    ('EjO', '', '', 'D'),
328
    ('Aju', '', '', 'D'),
329
    ('AjU', '', '', 'D'),
330
    ('uja', '', '', 'D'),
331
    ('ujA', '', '', 'D'),
332
    ('ujo', '', '', 'D'),
333
    ('ujO', '', '', 'D'),
334
    ('Aju', '', '', 'D'),
335
    ('AjU', '', '', 'D'),
336
    ('Uja', '', '', 'D'),
337
    ('UjA', '', '', 'D'),
338
    ('Ujo', '', '', 'D'),
339
    ('UjO', '', '', 'D'),
340
    ('Aju', '', '', 'D'),
341
    ('AjU', '', '', 'D'),
342
    ('ija', '', '', 'D'),
343
    ('ijA', '', '', 'D'),
344
    ('ijo', '', '', 'D'),
345
    ('ijO', '', '', 'D'),
346
    ('Aju', '', '', 'D'),
347
    ('AjU', '', '', 'D'),
348
    ('Ija', '', '', 'D'),
349
    ('IjA', '', '', 'D'),
350
    ('Ijo', '', '', 'D'),
351
    ('IjO', '', '', 'D'),
352
    ('Aju', '', '', 'D'),
353
    ('AjU', '', '', 'D'),
354
    ('j', '', '', 'i'),
355
    # lander = lender = länder
356
    ('lYndEr', '', '$', 'lYnder'),
357
    ('lander', '', '$', 'lYnder'),
358
    ('lAndEr', '', '$', 'lYnder'),
359
    ('lAnder', '', '$', 'lYnder'),
360
    ('landEr', '', '$', 'lYnder'),
361
    ('lender', '', '$', 'lYnder'),
362
    ('lEndEr', '', '$', 'lYnder'),
363
    ('lendEr', '', '$', 'lYnder'),
364
    ('lEnder', '', '$', 'lYnder'),
365
    # burg = berg
366
    ('burk', '', '$', '(burk|berk)'),
367
    ('bUrk', '', '$', '(burk|berk)'),
368
    ('burg', '', '$', '(burk|berk)'),
369
    ('bUrg', '', '$', '(burk|berk)'),
370
    ('Burk', '', '$', '(burk|berk)'),
371
    ('BUrk', '', '$', '(burk|berk)'),
372
    ('Burg', '', '$', '(burk|berk)'),
373
    ('BUrg', '', '$', '(burk|berk)'),
374
    # CONSONANTS {z & Z; s & S} are approximately interchangeable
375
    ('s', '', '[rmnl]', 'z'),
376
    ('S', '', '[rmnl]', 'z'),
377
    ('s', '[rmnl]', '', 'z'),
378
    ('S', '[rmnl]', '', 'z'),
379
    ('dS', '', '$', 'S'),
380
    ('dZ', '', '$', 'S'),
381
    ('Z', '', '$', 'S'),
382
    ('S', '', '$', '(S|s)'),
383
    ('z', '', '$', '(S|s)'),
384
    ('S', '', '', 's'),
385
    ('dZ', '', '', 'z'),
386
    ('Z', '', '', 'z'),
387
)
388
389
# gen/approxcyrillic.php
390
# this file uses the same rules as approxrussian.php
391
392
# gen/approxczech.php
393
394
# this file uses the same rules as approxfrench.php
395
396
# gen/approxdutch.php
397
# this file uses the same rules as approxfrench.php
398
399
# gen/approxenglish.php
400 1
_GEN_APPROX_ENGLISH = (
401
    # VOWELS
402
    ('I', '', '[^aEIeiou]e', '(Q|i|D)'),  # like in "five"
403
    ('I', '', '$', 'i'),
404
    ('I', '[aEIeiou]', '', 'i'),
405
    ('I', '', '[^k]$', 'i'),
406
    ('Ik', '[lr]', '$', '(ik|Qk)'),
407
    ('Ik', '', '$', 'ik'),
408
    ('sIts', '', '$', '(sits|sQts)'),
409
    ('Its', '', '$', 'its'),
410
    ('I', '', '', '(i|Q)'),
411
    ('lE', '[bdfgkmnprsStvzZ]', '', '(il|li|lY)'),  # Applebaum < Appelbaum
412
    ('au', '', '', '(D|a|u)'),
413
    ('ou', '', '', '(D|o|u)'),
414
    ('ai', '', '', '(D|a|i)'),
415
    ('oi', '', '', '(D|o|i)'),
416
    ('ui', '', '', '(D|u|i)'),
417
    (
418
        'E',
419
        'D[^aeiEIou]',
420
        '',
421
        '(i|)',
422
    ),  # Weinberg, Shaneberg (shaneberg/shejneberg) --> shejnberg  # noqa: E501
423
    ('e', 'D[^aeiEIou]', '', '(i|)'),
424
    ('e', '', '', 'i'),
425
    ('E', '', '[fklmnprsStv]$', 'i'),
426
    ('E', '', 'ts$', 'i'),
427
    ('E', '[DaoiEuQY]', '', 'i'),
428
    ('E', '', '[aoQY]', 'i'),
429
    ('E', '', '', '(Y|i)'),
430
    ('a', '', '', '(a|o)'),
431
)
432
433
# gen/approxfrench.php
434
# GENERAL
435 1
_GEN_APPROX_FRENCH = (
436
    ('au', '', '', '(D|a|u)'),
437
    ('ou', '', '', '(D|o|u)'),
438
    ('ai', '', '', '(D|a|i)'),
439
    ('oi', '', '', '(D|o|i)'),
440
    ('ui', '', '', '(D|u|i)'),
441
    ('a', '', '', '(a|o)'),
442
    ('e', '', '', 'i'),
443
)
444
445
# gen/approxgerman.php
446
447 1
_GEN_APPROX_GERMAN = (
448
    ('I', '', '$', 'i'),
449
    ('I', '[aeiAEIOUouQY]', '', 'i'),
450
    ('I', '', '[^k]$', 'i'),
451
    ('Ik', '[lr]', '$', '(ik|Qk)'),
452
    ('Ik', '', '$', 'ik'),
453
    ('sIts', '', '$', '(sits|sQts)'),
454
    ('Its', '', '$', 'its'),
455
    ('I', '', '', '(Q|i)'),
456
    ('AU', '', '', '(D|a|u)'),
457
    ('aU', '', '', '(D|a|u)'),
458
    ('Au', '', '', '(D|a|u)'),
459
    ('au', '', '', '(D|a|u)'),
460
    ('ou', '', '', '(D|o|u)'),
461
    ('OU', '', '', '(D|o|u)'),
462
    ('oU', '', '', '(D|o|u)'),
463
    ('Ou', '', '', '(D|o|u)'),
464
    ('ai', '', '', '(D|a|i)'),
465
    ('Ai', '', '', '(D|a|i)'),
466
    ('oi', '', '', '(D|o|i)'),
467
    ('Oi', '', '', '(D|o|i)'),
468
    ('ui', '', '', '(D|u|i)'),
469
    ('Ui', '', '', '(D|u|i)'),
470
    ('e', '', '', 'i'),
471
    ('E', '', '[fklmnprst]$', 'i'),
472
    ('E', '', 'ts$', 'i'),
473
    ('E', '', '$', 'i'),
474
    ('E', '[DaoAOUiuQY]', '', 'i'),
475
    ('E', '', '[aoAOQY]', 'i'),
476
    ('E', '', '', '(Y|i)'),
477
    ('O', '', '$', 'o'),
478
    ('O', '', '[fklmnprst]$', 'o'),
479
    ('O', '', 'ts$', 'o'),
480
    ('O', '[aoAOUeiuQY]', '', 'o'),
481
    ('O', '', '', '(o|Y)'),
482
    ('a', '', '', '(a|o)'),
483
    ('A', '', '$', '(a|o)'),
484
    ('A', '', '[fklmnprst]$', '(a|o)'),
485
    ('A', '', 'ts$', '(a|o)'),
486
    ('A', '[aoeOUiuQY]', '', '(a|o)'),
487
    ('A', '', '', '(a|o|Y)'),
488
    ('U', '', '$', 'u'),
489
    ('U', '[DaoiuUQY]', '', 'u'),
490
    ('U', '', '[^k]$', 'u'),
491
    ('Uk', '[lr]', '$', '(uk|Qk)'),
492
    ('Uk', '', '$', 'uk'),
493
    ('sUts', '', '$', '(suts|sQts)'),
494
    ('Uts', '', '$', 'uts'),
495
    ('U', '', '', '(u|Q)'),
496
)
497
498
# gen/approxgreek.php
499
500
# this file uses the same rules as approxfrench.php
501
502
# gen/approxgreeklatin.php
503 1
_GEN_APPROX_GREEKLATIN = (('N', '', '', ''),)
504
505
# gen/approxhebrew.php
506 1
# Empty rule table: no entries are defined for this language/mode.
_GEN_APPROX_HEBREW = ()
507
508
# gen/approxhungarian.php
509
510
# this file uses the same rules as approxfrench.php
511
512
# gen/approxitalian.php
513
# this file uses the same rules as approxfrench.php
514
515
# gen/approxlatvian.php
516
# this file uses the same rules as approxfrench.php
517
518
# gen/approxpolish.php
519 1
_GEN_APPROX_POLISH = (
520
    ('aiB', '', '[bp]', '(D|Dm)'),
521
    ('oiB', '', '[bp]', '(D|Dm)'),
522
    ('uiB', '', '[bp]', '(D|Dm)'),
523
    ('eiB', '', '[bp]', '(D|Dm)'),
524
    ('EiB', '', '[bp]', '(D|Dm)'),
525
    ('iiB', '', '[bp]', '(D|Dm)'),
526
    ('IiB', '', '[bp]', '(D|Dm)'),
527
    ('aiB', '', '[dgkstvz]', '(D|Dn)'),
528
    ('oiB', '', '[dgkstvz]', '(D|Dn)'),
529
    ('uiB', '', '[dgkstvz]', '(D|Dn)'),
530
    ('eiB', '', '[dgkstvz]', '(D|Dn)'),
531
    ('EiB', '', '[dgkstvz]', '(D|Dn)'),
532
    ('iiB', '', '[dgkstvz]', '(D|Dn)'),
533
    ('IiB', '', '[dgkstvz]', '(D|Dn)'),
534
    ('B', '', '[bp]', '(o|om|im)'),
535
    ('B', '', '[dgkstvz]', '(o|on|in)'),
536
    ('B', '', '', 'o'),
537
    ('aiF', '', '[bp]', '(D|Dm)'),
538
    ('oiF', '', '[bp]', '(D|Dm)'),
539
    ('uiF', '', '[bp]', '(D|Dm)'),
540
    ('eiF', '', '[bp]', '(D|Dm)'),
541
    ('EiF', '', '[bp]', '(D|Dm)'),
542
    ('iiF', '', '[bp]', '(D|Dm)'),
543
    ('IiF', '', '[bp]', '(D|Dm)'),
544
    ('aiF', '', '[dgkstvz]', '(D|Dn)'),
545
    ('oiF', '', '[dgkstvz]', '(D|Dn)'),
546
    ('uiF', '', '[dgkstvz]', '(D|Dn)'),
547
    ('eiF', '', '[dgkstvz]', '(D|Dn)'),
548
    ('EiF', '', '[dgkstvz]', '(D|Dn)'),
549
    ('iiF', '', '[dgkstvz]', '(D|Dn)'),
550
    ('IiF', '', '[dgkstvz]', '(D|Dn)'),
551
    ('F', '', '[bp]', '(i|im|om)'),
552
    ('F', '', '[dgkstvz]', '(i|in|on)'),
553
    ('F', '', '', 'i'),
554
    ('P', '', '', '(o|u)'),
555
    ('I', '', '$', 'i'),
556
    ('I', '', '[^k]$', 'i'),
557
    ('Ik', '[lr]', '$', '(ik|Qk)'),
558
    ('Ik', '', '$', 'ik'),
559
    ('sIts', '', '$', '(sits|sQts)'),
560
    ('Its', '', '$', 'its'),
561
    ('I', '[aeiAEBFIou]', '', 'i'),
562
    ('I', '', '', '(i|Q)'),
563
    ('au', '', '', '(D|a|u)'),
564
    ('ou', '', '', '(D|o|u)'),
565
    ('ai', '', '', '(D|a|i)'),
566
    ('oi', '', '', '(D|o|i)'),
567
    ('ui', '', '', '(D|u|i)'),
568
    ('a', '', '', '(a|o)'),
569
    ('e', '', '', 'i'),
570
    ('E', '', '[fklmnprst]$', 'i'),
571
    ('E', '', 'ts$', 'i'),
572
    ('E', '', '$', 'i'),
573
    ('E', '[DaoiuQ]', '', 'i'),
574
    ('E', '', '[aoQ]', 'i'),
575
    ('E', '', '', '(Y|i)'),
576
)
577
578
# gen/approxportuguese.php
579
580
# this file uses the same rules as approxfrench.php
581
582
# gen/approxromanian.php
583
# this file uses the same rules as approxpolish.php
584
585
# gen/approxrussian.php
586
587 1
_GEN_APPROX_RUSSIAN = (
588
    # VOWELS
589
    ('I', '', '$', 'i'),
590
    ('I', '', '[^k]$', 'i'),
591
    ('Ik', '[lr]', '$', '(ik|Qk)'),
592
    ('Ik', '', '$', 'ik'),
593
    ('sIts', '', '$', '(sits|sQts)'),
594
    ('Its', '', '$', 'its'),
595
    ('I', '[aeiEIou]', '', 'i'),
596
    ('I', '', '', '(i|Q)'),
597
    ('au', '', '', '(D|a|u)'),
598
    ('ou', '', '', '(D|o|u)'),
599
    ('ai', '', '', '(D|a|i)'),
600
    ('oi', '', '', '(D|o|i)'),
601
    ('ui', '', '', '(D|u|i)'),
602
    ('om', '', '[bp]', '(om|im)'),
603
    ('on', '', '[dgkstvz]', '(on|in)'),
604
    ('em', '', '[bp]', '(im|om)'),
605
    ('en', '', '[dgkstvz]', '(in|on)'),
606
    ('Em', '', '[bp]', '(im|Ym|om)'),
607
    ('En', '', '[dgkstvz]', '(in|Yn|on)'),
608
    ('a', '', '', '(a|o)'),
609
    ('e', '', '', 'i'),
610
    ('E', '', '[fklmnprsStv]$', 'i'),
611
    ('E', '', 'ts$', 'i'),
612
    ('E', '[DaoiuQ]', '', 'i'),
613
    ('E', '', '[aoQ]', 'i'),
614
    ('E', '', '', '(Y|i)'),
615
)
616
617
# gen/approxspanish.php
618
619 1
_GEN_APPROX_SPANISH = (('B', '', '', '(b|v)'), ('V', '', '', '(b|v)'))
620
621
# gen/approxturkish.php
622
# this file uses the same rules as approxfrench.php
623
624
# gen/exactany.php
625
# GENERAL
626
# A, E, I, O, P, U should create variants,
627
# EE = final "e" (english & french)
628
# V, B from Spanish
629
# but a, e, i, o, u should not create any new variant
630 1
_GEN_EXACT_ANY = (
631
    ('EE', '', '$', 'e'),
632
    ('A', '', '', 'a'),
633
    ('E', '', '', 'e'),
634
    ('I', '', '', 'i'),
635
    ('O', '', '', 'o'),
636
    ('P', '', '', 'o'),
637
    ('U', '', '', 'u'),
638
    ('B', '', '[fktSs]', 'p'),
639
    ('B', '', 'p', ''),
640
    ('B', '', '$', 'p'),
641
    ('V', '', '[pktSs]', 'f'),
642
    ('V', '', 'f', ''),
643
    ('V', '', '$', 'f'),
644
    ('B', '', '', 'b'),
645
    ('V', '', '', 'v'),
646
)
647
648
# gen/exactapproxcommon.php
649
# GENERAL
650 1
_GEN_EXACT_APPROX_COMMON = (
651
    ('h', '', '$', ''),
652
    # VOICED - UNVOICED CONSONANTS
653
    ('b', '', '[fktSs]', 'p'),
654
    ('b', '', 'p', ''),
655
    ('b', '', '$', 'p'),
656
    ('p', '', '[vgdZz]', 'b'),  # Ashk: "v" excluded (everythere)
657
    ('p', '', 'b', ''),
658
    ('v', '', '[pktSs]', 'f'),
659
    ('v', '', 'f', ''),
660
    ('v', '', '$', 'f'),
661
    ('f', '', '[vbgdZz]', 'v'),
662
    ('f', '', 'v', ''),
663
    ('g', '', '[pftSs]', 'k'),
664
    ('g', '', 'k', ''),
665
    ('g', '', '$', 'k'),
666
    ('k', '', '[vbdZz]', 'g'),
667
    ('k', '', 'g', ''),
668
    ('d', '', '[pfkSs]', 't'),
669
    ('d', '', 't', ''),
670
    ('d', '', '$', 't'),
671
    ('t', '', '[vbgZz]', 'd'),
672
    ('t', '', 'd', ''),
673
    ('s', '', 'dZ', ''),
674
    ('s', '', 'tS', ''),
675
    ('z', '', '[pfkSt]', 's'),
676
    ('z', '', '[sSzZ]', ''),
677
    ('s', '', '[sSzZ]', ''),
678
    ('Z', '', '[sSzZ]', ''),
679
    ('S', '', '[sSzZ]', ''),
680
    # SIMPLIFICATION OF CONSONANT CLUSTERS
681
    ('jnm', '', '', 'jm'),
682
    # DOUBLE --> SINGLE
683
    ('ji', '^', '', 'i'),
684
    ('jI', '^', '', 'I'),
685
    ('a', '', '[aA]', ''),
686
    ('a', 'A', '', ''),
687
    ('A', '', 'A', ''),
688
    ('b', '', 'b', ''),
689
    ('d', '', 'd', ''),
690
    ('f', '', 'f', ''),
691
    ('g', '', 'g', ''),
692
    ('j', '', 'j', ''),
693
    ('k', '', 'k', ''),
694
    ('l', '', 'l', ''),
695
    ('m', '', 'm', ''),
696
    ('n', '', 'n', ''),
697
    ('p', '', 'p', ''),
698
    ('r', '', 'r', ''),
699
    ('t', '', 't', ''),
700
    ('v', '', 'v', ''),
701
    ('z', '', 'z', '')
702
    # do not put name of file here since it always gets merged into another file  # noqa: E501
703
)
704
705
# gen/exactarabic.php
706 1
_GEN_EXACT_ARABIC = (('1', '', '', ''),)
707
708
# gen/exactcommon.php
709
# GENERAL
710
711 1
_GEN_EXACT_COMMON = (
712
    ('H', '', '', ''),
713
    # VOICED - UNVOICED CONSONANTS
714
    ('s', '[^t]', '[bgZd]', 'z'),
715
    ('Z', '', '[pfkst]', 'S'),
716
    ('Z', '', '$', 'S'),
717
    ('S', '', '[bgzd]', 'Z'),
718
    ('z', '', '$', 's'),
719
    ('ji', '[aAoOeEiIuU]', '', 'j'),
720
    ('jI', '[aAoOeEiIuU]', '', 'j'),
721
    ('je', '[aAoOeEiIuU]', '', 'j'),
722
    ('jE', '[aAoOeEiIuU]', '', 'j'),
723
)
724
725
# gen/exactcyrillic.php
726
# this file uses the same rules as exactrussian.php
727
728
# gen/exactczech.php
729
# this file uses the same rules as exactrussian.php
730
731
# gen/exactdutch.php
732 1
# Empty rule table: no entries are defined for this language/mode.
_GEN_EXACT_DUTCH = ()
733
734
# gen/exactenglish.php
735
# this file uses the same rules as exactrussian.php
736
737
# gen/exactfrench.php
738
# GENERAL
739 1
# Empty rule table: no entries are defined for this language/mode.
_GEN_EXACT_FRENCH = ()
740
741
# gen/exactgerman.php
742
# this file uses the same rules as exactany.php
743
744
# gen/exactgreek.php
745 1
# Empty rule table: no entries are defined for this language/mode.
_GEN_EXACT_GREEK = ()
746
747
# gen/exactgreeklatin.php
748 1
_GEN_EXACT_GREEKLATIN = (('N', '', '', 'n'),)
749
750
# gen/exacthebrew.php
751 1
# Empty rule table: no entries are defined for this language/mode.
_GEN_EXACT_HEBREW = ()
752
753
# gen/exacthungarian.php
754
# this file uses the same rules as exactrussian.php
755
756
# gen/exactitalian.php
757
# GENERAL
758 1
# Empty rule table: no entries are defined for this language/mode.
_GEN_EXACT_ITALIAN = ()
759
760
# gen/exactlatvian.php
761
# GENERAL
762 1
# Empty rule table: no entries are defined for this language/mode.
_GEN_EXACT_LATVIAN = ()
763
764
# gen/exactpolish.php
765 1
_GEN_EXACT_POLISH = (
766
    ('B', '', '', 'a'),
767
    ('F', '', '', 'e'),
768
    ('P', '', '', 'o'),
769
    ('E', '', '', 'e'),
770
    ('I', '', '', 'i'),
771
)
772
773
# gen/exactportuguese.php
774
# GENERAL
775 1
# Empty rule table: no entries are defined for this language/mode.
_GEN_EXACT_PORTUGUESE = ()
776
777
# gen/exactromanian.php
778
# this file uses the same rules as exactrussian.php
779
780
# gen/exactrussian.php
781 1
_GEN_EXACT_RUSSIAN = (('E', '', '', 'e'), ('I', '', '', 'i'))
782
783
# gen/exactspanish.php
784
# GENERAL
785 1
_GEN_EXACT_SPANISH = (('B', '', '', 'b'), ('V', '', '', 'v'))
786
787
# gen/exactturkish.php
788 1
# Empty rule table: no entries are defined for this language/mode.
_GEN_EXACT_TURKISH = ()
789
790
# gen/hebrewcommon.php
791
# GENERAL
792
793 1
_GEN_HEBREW_COMMON = (
794
    ('ts', '', '', 'C'),  # for not confusion Gutes [=guts] and Guts [=guc]
795
    ('tS', '', '', 'C'),  # same reason
796
    ('S', '', '', 's'),
797
    ('p', '', '', 'f'),
798
    ('b', '^', '', 'b'),
799
    ('b', '', '', '(b|v)'),
800
    ('B', '', '', '(b|v)'),  # Spanish "b"
801
    ('V', '', '', 'v'),  # Spanish "v"
802
    ('EE', '', '', '(1|)'),  # final "e" (english & french)
803
    ('ja', '', '', 'i'),
804
    ('jA', '', '', 'i'),
805
    ('je', '', '', 'i'),
806
    ('jE', '', '', 'i'),
807
    ('aj', '', '', 'i'),
808
    ('Aj', '', '', 'i'),
809
    ('I', '', '', 'i'),
810
    ('j', '', '', 'i'),
811
    ('a', '^', '', '1'),
812
    ('A', '^', '', '1'),
813
    ('e', '^', '', '1'),
814
    ('E', '^', '', '1'),
815
    ('Y', '^', '', '1'),
816
    ('a', '', '$', '1'),
817
    ('A', '', '$', '1'),
818
    ('e', '', '$', '1'),
819
    ('E', '', '$', '1'),
820
    ('Y', '', '$', '1'),
821
    ('a', '', '', ''),
822
    ('A', '', '', ''),
823
    ('e', '', '', ''),
824
    ('E', '', '', ''),
825
    ('Y', '', '', ''),
826
    ('oj', '^', '', '(u|vi)'),
827
    ('Oj', '^', '', '(u|vi)'),
828
    ('uj', '^', '', '(u|vi)'),
829
    ('Uj', '^', '', '(u|vi)'),
830
    ('oj', '', '', 'u'),
831
    ('Oj', '', '', 'u'),
832
    ('uj', '', '', 'u'),
833
    ('Uj', '', '', 'u'),
834
    ('ou', '^', '', '(u|v|1)'),
835
    ('o', '^', '', '(u|v|1)'),
836
    ('O', '^', '', '(u|v|1)'),
837
    ('P', '^', '', '(u|v|1)'),
838
    ('U', '^', '', '(u|v|1)'),
839
    ('u', '^', '', '(u|v|1)'),
840
    ('o', '', '$', '(u|1)'),
841
    ('O', '', '$', '(u|1)'),
842
    ('P', '', '$', '(u|1)'),
843
    ('u', '', '$', '(u|1)'),
844
    ('U', '', '$', '(u|1)'),
845
    ('ou', '', '', 'u'),
846
    ('o', '', '', 'u'),
847
    ('O', '', '', 'u'),
848
    ('P', '', '', 'u'),
849
    ('U', '', '', 'u'),
850
    ('VV', '', '', 'u'),  # alef/ayin + vov from ruleshebrew
851
    ('V', '', '', 'v'),  # tsvey-vov from ruleshebrew;; only Ashkenazic
852
    ('L', '^', '', '1'),  # alef/ayin from  ruleshebrew
853
    ('L', '', '$', '1'),  # alef/ayin from  ruleshebrew
854
    ('L', '', '', ''),  # alef/ayin from  ruleshebrew
855
    ('WW', '^', '', '(vi|u)'),  # vav-yod from  ruleshebrew
856
    ('WW', '', '', 'u'),  # vav-yod from  ruleshebrew
857
    ('W', '^', '', '(u|v)'),  # vav from  ruleshebrew
858
    ('W', '', '', 'u'),  # vav from  ruleshebrew
859
    # ("g","","","(g|Z)"),
860
    # ("z","","","(z|Z)"),
861
    # ("d","","","(d|dZ)"),
862
    ('TB', '^', '', 't'),  # tav from ruleshebrew
863
    ('TB', '', '', '(t|s)'),  # tav from ruleshebrew; s is only Ashkenazic
864
    ('T', '', '', 't'),  # tet from  ruleshebrew
865
    # ("k","","","(k|x)"),
866
    # ("x","","","(k|x)"),
867
    ('K', '', '', 'k'),  # kof and initial kaf from ruleshebrew
868
    ('X', '', '', 'x'),  # khet and final kaf from ruleshebrew
869
    ('H', '^', '', '(x|1)'),
870
    ('H', '', '$', '(x|1)'),
871
    ('H', '', '', '(x|)'),
872
    ('h', '^', '', '1'),
873
    ('h', '', '', ''),
874
)
875
876
# gen/lang.php
877
# GENERIC
878
879
# format of entries in $languageRules table is
880
# (pattern, language, Acceptance)
881
# where
882
# pattern is a regular expression
883
# e.g., ^ means start of word, $ means end of word, [^ei] means anything but e or i, etc.  # noqa: E501
0 ignored issues
show
Coding Style introduced by
This line is too long as per the coding-style (103/100).

This check looks for lines that are too long. You can specify the maximum line length.

Loading history...
884
# language is one or more of the languages defined above separated by + signs
885
# acceptance is true or false
886
# meaning is:
887
# if "pattern" matches and acceptance is true, name is in one of the languages indicated and no others  # noqa: E501
0 ignored issues
show
Coding Style introduced by
This line is too long as per the coding-style (116/100).

This check looks for lines that are too long. You can specify the maximum line length.

Loading history...
888
# if "pattern" matches and acceptance is false, name is not in any of the languages indicated  # noqa: E501
0 ignored issues
show
Coding Style introduced by
This line is too long as per the coding-style (107/100).

This check looks for lines that are too long. You can specify the maximum line length.

Loading history...
889
890 1
_GEN_LANGUAGE_RULES = (
891
    # 1. following are rules to accept the language
892
    # 1.1 Special letter combinations
893
    ('^o’', 32, True),
894
    ('^o\'', 32, True),
895
    ('^mc', 32, True),
896
    ('^fitz', 32, True),
897
    ('ceau', 65600, True),
898
    ('eau', 65536, True),
899
    ('ault$', 64, True),
900
    ('oult$', 64, True),
901
    ('eux$', 64, True),
902
    ('eix$', 64, True),
903
    ('glou$', 512, True),
904
    ('uu', 16, True),
905
    ('tx', 262144, True),
906
    ('witz', 128, True),
907
    ('tz$', 131232, True),
908
    ('^tz', 131104, True),
909
    ('poulos$', 512, True),
910
    ('pulos$', 512, True),
911
    ('iou', 512, True),
912
    ('sj$', 16, True),
913
    ('^sj', 16, True),
914
    ('güe', 262144, True),
915
    ('güi', 262144, True),
916
    ('ghe', 66048, True),
917
    ('ghi', 66048, True),
918
    ('escu$', 65536, True),
919
    ('esco$', 65536, True),
920
    ('vici$', 65536, True),
921
    ('schi$', 65536, True),
922
    ('ii$', 131072, True),
923
    ('iy$', 131072, True),
924
    ('yy$', 131072, True),
925
    ('yi$', 131072, True),
926
    ('^rz', 16384, True),
927
    ('rz$', 16512, True),
928
    ('[bcdfgklmnpstwz]rz', 16384, True),
929
    ('rz[bcdfghklmnpstw]', 16384, True),
930
    ('cki$', 16384, True),
931
    ('ska$', 16384, True),
932
    ('cka$', 16384, True),
933
    ('ae', 131232, True),
934
    ('oe', 131312, True),
935
    ('th$', 160, True),
936
    ('^th', 672, True),
937
    ('mann', 128, True),
938
    ('cz', 16384, True),
939
    ('cy', 16896, True),
940
    ('niew', 16384, True),
941
    ('etti$', 4096, True),
942
    ('eti$', 4096, True),
943
    ('ati$', 4096, True),
944
    ('ato$', 4096, True),
945
    ('[aoei]no$', 4096, True),
946
    ('[aoei]ni$', 4096, True),
947
    ('esi$', 4096, True),
948
    ('oli$', 4096, True),
949
    ('field$', 32, True),
950
    ('stein', 128, True),
951
    ('heim$', 128, True),
952
    ('heimer$', 128, True),
953
    ('thal', 128, True),
954
    ('zweig', 128, True),
955
    ('[aeou]h', 128, True),
956
    ('äh', 128, True),
957
    ('öh', 128, True),
958
    ('üh', 128, True),
959
    ('[ln]h[ao]$', 32768, True),
960
    ('[ln]h[aou]', 819416, True),
961
    ('chsch', 128, True),
962
    ('tsch', 128, True),
963
    ('sch$', 131200, True),
964
    ('^sch', 131200, True),
965
    ('ck$', 160, True),
966
    ('c$', 608264, True),
967
    ('sz', 18432, True),
968
    ('cs$', 2048, True),
969
    ('^cs', 2048, True),
970
    ('dzs', 2048, True),
971
    ('zs$', 2048, True),
972
    ('^zs', 2048, True),
973
    ('^wl', 16384, True),
974
    ('^wr', 16560, True),
975
    ('gy$', 2048, True),
976
    ('gy[aeou]', 2048, True),
977
    ('gy', 133696, True),
978
    ('guy', 64, True),
979
    ('gu[ei]', 294976, True),
980
    ('gu[ao]', 294912, True),
981
    ('gi[aou]', 4608, True),
982
    ('ly', 150016, True),
983
    ('ny', 412160, True),
984
    ('ty', 150016, True),
985
    # 1.2 special characters
986
    ('ā', 8192, True),
987
    ('ć', 16384, True),
988
    ('ç', 819264, True),
989
    ('č', 8200, True),
990
    ('ď', 8, True),
991
    ('ē', 8192, True),
992
    ('ğ', 524288, True),
993
    ('ģ', 8192, True),
994
    ('ī', 8192, True),
995
    ('ķ', 8192, True),
996
    ('ļ', 8192, True),
997
    ('ł', 16384, True),
998
    ('ņ', 8192, True),
999
    ('ń', 16384, True),
1000
    ('ñ', 262144, True),
1001
    ('ň', 8, True),
1002
    ('ř', 8, True),
1003
    ('ś', 16384, True),
1004
    ('ş', 589824, True),
1005
    ('š', 8200, True),
1006
    ('ţ', 65536, True),
1007
    ('ť', 8, True),
1008
    ('ź', 16384, True),
1009
    ('ž', 8200, True),
1010
    ('ż', 16384, True),
1011
    ('ß', 128, True),
1012
    ('ä', 128, True),
1013
    ('á', 297480, True),
1014
    ('â', 98368, True),
1015
    ('ă', 65536, True),
1016
    ('ą', 16384, True),
1017
    ('à', 32768, True),
1018
    ('ã', 32768, True),
1019
    ('ę', 16384, True),
1020
    ('é', 2632, True),
1021
    ('è', 266304, True),
1022
    ('ê', 64, True),
1023
    ('ě', 8, True),
1024
    ('ê', 32832, True),
1025
    ('í', 297480, True),
1026
    ('î', 65600, True),
1027
    ('ı', 524288, True),
1028
    ('ó', 317960, True),
1029
    ('ö', 526464, True),
1030
    ('ô', 32832, True),
1031
    ('õ', 34816, True),
1032
    ('ò', 266240, True),
1033
    ('ű', 2048, True),
1034
    ('ú', 297480, True),
1035
    ('ü', 821376, True),
1036
    ('ù', 64, True),
1037
    ('ů', 8, True),
1038
    ('ý', 520, True),
1039
    # Every Cyrillic word has at least one Cyrillic vowel (аёеоиуыэюя)
1040
    ('а', 4, True),
1041
    ('ё', 4, True),
1042
    ('о', 4, True),
1043
    ('е', 4, True),
1044
    ('и', 4, True),
1045
    ('у', 4, True),
1046
    ('ы', 4, True),
1047
    ('э', 4, True),
1048
    ('ю', 4, True),
1049
    ('я', 4, True),
1050
    # Every Greek word has at least one Greek vowel
1051
    ('α', 256, True),
1052
    ('ε', 256, True),
1053
    ('η', 256, True),
1054
    ('ι', 256, True),
1055
    ('ο', 256, True),
1056
    ('υ', 256, True),
1057
    ('ω', 256, True),
1058
    # Arabic (only initial)
1059
    ('ا', 2, True),  # alif (isol + init)
1060
    ('ب', 2, True),  # ba'
1061
    ('ت', 2, True),  # ta'
1062
    ('ث', 2, True),  # tha'
1063
    ('ج', 2, True),  # jim
1064
    ('ح', 2, True),  # h.a'
1065
    ('خ\'', 2, True),  # kha'
1066
    ('د', 2, True),  # dal (isol + init)
1067
    ('ذ', 2, True),  # dhal (isol + init)
1068
    ('ر', 2, True),  # ra' (isol + init)
1069
    ('ز', 2, True),  # za' (isol + init)
1070
    ('س', 2, True),  # sin
1071
    ('ش', 2, True),  # shin
1072
    ('ص', 2, True),  # s.ad
1073
    ('ض', 2, True),  # d.ad
1074
    ('ط', 2, True),  # t.a'
1075
    ('ظ', 2, True),  # z.a'
1076
    ('ع', 2, True),  # 'ayn
1077
    ('غ', 2, True),  # ghayn
1078
    ('ف', 2, True),  # fa'
1079
    ('ق', 2, True),  # qaf
1080
    ('ك', 2, True),  # kaf
1081
    ('ل', 2, True),  # lam
1082
    ('م', 2, True),  # mim
1083
    ('ن', 2, True),  # nun
1084
    ('ه', 2, True),  # ha'
1085
    ('و', 2, True),  # waw (isol + init)
1086
    ('ي', 2, True),  # ya'
1087
    ('آ', 2, True),  # alif madda
1088
    ('إ', 2, True),  # alif + diacritic
1089
    ('أ', 2, True),  # alif + hamza
1090
    ('ؤ', 2, True),  # waw + hamza
1091
    ('ئ', 2, True),  # ya' + hamza
1092
    # ("لا‎", 2, True), # ligature l+a
1093
    # Hebrew
1094
    ('א', 1024, True),
1095
    ('ב', 1024, True),
1096
    ('ג', 1024, True),
1097
    ('ד', 1024, True),
1098
    ('ה', 1024, True),
1099
    ('ו', 1024, True),
1100
    ('ז', 1024, True),
1101
    ('ח', 1024, True),
1102
    ('ט', 1024, True),
1103
    ('י', 1024, True),
1104
    ('כ', 1024, True),
1105
    ('ל', 1024, True),
1106
    ('מ', 1024, True),
1107
    ('נ', 1024, True),
1108
    ('ס', 1024, True),
1109
    ('ע', 1024, True),
1110
    ('פ', 1024, True),
1111
    ('צ', 1024, True),
1112
    ('ק', 1024, True),
1113
    ('ר', 1024, True),
1114
    ('ש', 1024, True),
1115
    ('ת', 1024, True),
1116
    # 2. following are rules to reject the language
1117
    # Every Latin character word has at least one Latin vowel
1118
    ('a', 1286, False),
1119
    ('o', 1286, False),
1120
    ('e', 1286, False),
1121
    ('i', 1286, False),
1122
    ('y', 75030, False),
1123
    ('u', 1286, False),
1124
    ('j', 4096, False),
1125
    ('j[^aoeiuy]', 295488, False),
1126
    ('g', 8, False),
1127
    ('k', 364608, False),
1128
    ('q', 748056, False),
1129
    ('v', 16384, False),
1130
    ('w', 993864, False),
1131
    ('x', 534552, False),  # polish excluded from the list
1132
    ('dj', 786432, False),
1133
    (
1134
        'v[^aoeiu]',
1135
        128,
1136
        False,
1137
    ),  # in german, "v" can be found before a vowel only  # noqa: E501
1138
    (
1139
        'y[^aoeiu]',
1140
        128,
1141
        False,
1142
    ),  # in german, "y" usually appears only in the last position; sometimes before a vowel  # noqa: E501
0 ignored issues
show
Coding Style introduced by
This line is too long as per the coding-style (106/100).

This check looks for lines that are too long. You can specify the maximum line length.

Loading history...
1143
    ('c[^aohk]', 128, False),
1144
    ('dzi', 524512, False),
1145
    ('ou', 128, False),
1146
    ('a[eiou]', 524288, False),  # no diphthongs in Turkish
1147
    ('ö[eaiou]', 524288, False),
1148
    ('ü[eaiou]', 524288, False),
1149
    ('e[aiou]', 524288, False),
1150
    ('i[aeou]', 524288, False),
1151
    ('o[aieu]', 524288, False),
1152
    ('u[aieo]', 524288, False),
1153
    ('aj', 240, False),
1154
    ('ej', 240, False),
1155
    ('oj', 240, False),
1156
    ('uj', 240, False),
1157
    ('eu', 147456, False),
1158
    ('ky', 16384, False),
1159
    ('kie', 262720, False),
1160
    ('gie', 360960, False),
1161
    ('ch[aou]', 4096, False),
1162
    ('ch', 524288, False),
1163
    ('son$', 128, False),
1164
    ('sc[ei]', 64, False),
1165
    ('sch', 280640, False),
1166
    ('^h', 131072, False),
1167
)
1168
1169
# gen/languagenames.php
1170 1
_GEN_LANGUAGES = (
1171
    'any',
1172
    'arabic',
1173
    'cyrillic',
1174
    'czech',
1175
    'dutch',
1176
    'english',
1177
    'french',
1178
    'german',
1179
    'greek',  # noqa: E501
1180
    'greeklatin',
1181
    'hebrew',
1182
    'hungarian',
1183
    'italian',
1184
    'latvian',
1185
    'polish',
1186
    'portuguese',  # noqa: E501
1187
    'romanian',
1188
    'russian',
1189
    'spanish',
1190
    'turkish',
1191
)
1192
1193
# gen/rulesany.php
1194
# Format of each rule entry in the table:
#   (pattern, left context, right context, phonetic)
# where
#   pattern is a sequence of characters that might appear in the word to be
#     transliterated
#   left context is the context that precedes the pattern
#   right context is the context that follows the pattern
#   phonetic is the result that this rule generates
#
# Note that both left context and right context can be regular expressions,
# e.g.:
#   left context of ^ means start of word
#   left context of [aeiouy] means following a vowel
#   right context of [^aeiouy] means preceding a consonant
#   right context of e$ means preceding a final e
1207
1208
# GENERIC
1209 1
_GEN_RULES_ANY = (
1210
    # CONVERTING FEMININE TO MASCULINE
1211
    ('yna', '', '$', '(in[131072]|ina)'),
1212
    ('ina', '', '$', '(in[131072]|ina)'),
1213
    ('liova', '', '$', '(lova|lof[131072]|lef[131072])'),
1214
    ('lova', '', '$', '(lova|lof[131072]|lef[131072]|l[8]|el[8])'),
1215
    ('kova', '', '$', '(kova|kof[131072]|k[8]|ek[8])'),
1216
    ('ova', '', '$', '(ova|of[131072]|[8])'),
1217
    ('ová', '', '$', '(ova|[8])'),
1218
    ('eva', '', '$', '(eva|ef[131072])'),
1219
    ('aia', '', '$', '(aja|i[131072])'),
1220
    ('aja', '', '$', '(aja|i[131072])'),
1221
    ('aya', '', '$', '(aja|i[131072])'),
1222
    ('lowa', '', '$', '(lova|lof[16384]|l[16384]|el[16384])'),
1223
    ('kowa', '', '$', '(kova|kof[16384]|k[16384]|ek[16384])'),
1224
    ('owa', '', '$', '(ova|of[16384]|)'),
1225
    ('lowna', '', '$', '(lovna|levna|l[16384]|el[16384])'),
1226
    ('kowna', '', '$', '(kovna|k[16384]|ek[16384])'),
1227
    ('owna', '', '$', '(ovna|[16384])'),
1228
    ('lówna', '', '$', '(l|el)'),  # polish
1229
    ('kówna', '', '$', '(k|ek)'),  # polish
1230
    ('ówna', '', '$', ''),  # polish
1231
    ('á', '', '$', '(a|i[8])'),
1232
    ('a', '', '$', '(a|i[16392])'),
1233
    # CONSONANTS
1234
    ('pf', '', '', '(pf|p|f)'),
1235
    ('que', '', '$', '(k[64]|ke|kve)'),
1236
    ('qu', '', '', '(kv|k)'),
1237
    ('m', '', '[bfpv]', '(m|n)'),
1238
    ('m', '[aeiouy]', '[aeiouy]', 'm'),
1239
    ('m', '[aeiouy]', '', '(m|n[32832])'),  # nasal
1240
    ('ly', '', '[au]', 'l'),
1241
    ('li', '', '[au]', 'l'),
1242
    ('lio', '', '', '(lo|le[131072])'),
1243
    ('lyo', '', '', '(lo|le[131072])'),
1244
    # ("ll","","","(l|J[262144])"),  # Disabled Argentinian rule
1245
    ('lt', 'u', '$', '(lt|[64])'),
1246
    ('v', '^', '', '(v|f[128]|b[262144])'),
1247
    ('ex', '', '[aáuiíoóeéêy]', '(ez[32768]|eS[32768]|eks|egz)'),
1248
    ('ex', '', '[cs]', '(e[32768]|ek)'),
1249
    ('x', 'u', '$', '(ks|[64])'),
1250
    ('ck', '', '', '(k|tsk[16392])'),
1251
    ('cz', '', '', '(tS|tsz[8])'),  # Polish
1252
    # Proceccing of "h" in various combinations
1253
    ('rh', '^', '', 'r'),
1254
    ('dh', '^', '', 'd'),
1255
    ('bh', '^', '', 'b'),
1256
    ('ph', '', '', '(ph|f)'),
1257
    ('kh', '', '', '(x[131104]|kh)'),
1258
    ('lh', '', '', '(lh|l[32768])'),
1259
    ('nh', '', '', '(nh|nj[32768])'),
1260
    ('ssch', '', '', 'S'),  # german
1261
    ('chsch', '', '', 'xS'),  # german
1262
    ('tsch', '', '', 'tS'),  # german
1263
    # ("desch","^","","deS"),
1264
    # ("desh","^","","(dES|de[64])"),
1265
    # ("des","^","[^aeiouy]","(dEs|de[64])"),
1266
    ('sch', '[aeiouy]', '[ei]', '(S|StS[131072]|sk[69632])'),
1267
    ('sch', '[aeiouy]', '', '(S|StS[131072])'),
1268
    ('sch', '', '[ei]', '(sk[69632]|S|StS[131072])'),
1269
    ('sch', '', '', '(S|StS[131072])'),
1270
    ('ssh', '', '', 'S'),
1271
    ('sh', '', '[äöü]', 'sh'),  # german
1272
    ('sh', '', '[aeiou]', '(S[131104]|sh)'),
1273
    ('sh', '', '', 'S'),
1274
    ('zh', '', '', '(Z[131104]|zh|tsh[128])'),
1275
    ('chs', '', '', '(ks[128]|xs|tSs[131104])'),
1276
    ('ch', '', '[ei]', '(x|tS[393248]|k[69632]|S[32832])'),
1277
    ('ch', '', '', '(x|tS[393248]|S[32832])'),
1278
    ('th', '^', '', 't'),  # english+german+greeklatin
1279
    ('th', '', '[äöüaeiou]', '(t[672]|th)'),
1280
    ('th', '', '', 't'),  # english+german+greeklatin
1281
    ('gh', '', '[ei]', '(g[70144]|gh)'),
1282
    ('ouh', '', '[aioe]', '(v[64]|uh)'),
1283
    ('uh', '', '[aioe]', '(v|uh)'),
1284
    ('h', '', '$', ''),
1285
    ('h', '[aeiouyäöü]', '', ''),  # 128
1286
    ('h', '^', '', '(h|x[66048]|H[381024])'),
1287
    # Processing of "ci", "ce" & "cy"
1288
    ('cia', '', '', '(tSa[16384]|tsa)'),  # Polish
1289
    ('cią', '', '[bp]', '(tSom|tsom)'),  # Polish
1290
    ('cią', '', '', '(tSon[16384]|tson)'),  # Polish
1291
    ('cię', '', '[bp]', '(tSem[16384]|tsem)'),  # Polish
1292
    ('cię', '', '', '(tSen[16384]|tsen)'),  # Polish
1293
    ('cie', '', '', '(tSe[16384]|tse)'),  # Polish
1294
    ('cio', '', '', '(tSo[16384]|tso)'),  # Polish
1295
    ('ciu', '', '', '(tSu[16384]|tsu)'),  # Polish
1296
    (
1297
        'sci',
1298
        '',
1299
        '$',
1300
        '(Si[4096]|stsi[16392]|dZi[524288]|tSi[81920]|tS[65536]|si)',
1301
    ),  # noqa: E501
1302
    ('sc', '', '[ei]', '(S[4096]|sts[16392]|dZ[524288]|tS[81920]|s)'),
1303
    ('ci', '', '$', '(tsi[16392]|dZi[524288]|tSi[81920]|tS[65536]|si)'),
1304
    ('cy', '', '', '(si|tsi[16384])'),
1305
    ('c', '', '[ei]', '(ts[16392]|dZ[524288]|tS[81920]|k[512]|s)'),
1306
    # Processing of "s"
1307
    ('sç', '', '[aeiou]', '(s|stS[524288])'),
1308
    ('ssz', '', '', 'S'),  # polish
1309
    ('sz', '^', '', '(S|s[2048])'),  # polish
1310
    ('sz', '', '$', '(S|s[2048])'),  # polish
1311
    ('sz', '', '', '(S|s[2048]|sts[128])'),  # polish
1312
    ('ssp', '', '', '(Sp[128]|sp)'),
1313
    ('sp', '', '', '(Sp[128]|sp)'),
1314
    ('sst', '', '', '(St[128]|st)'),
1315
    ('st', '', '', '(St[128]|st)'),
1316
    ('ss', '', '', 's'),
1317
    ('sj', '^', '', 'S'),  # dutch
1318
    ('sj', '', '$', 'S'),  # dutch
1319
    ('sj', '', '', '(sj|S[16]|sx[262144]|sZ[589824])'),
1320
    ('sia', '', '', '(Sa[16384]|sa[16384]|sja)'),
1321
    ('sią', '', '[bp]', '(Som[16384]|som)'),  # polish
1322
    ('sią', '', '', '(Son[16384]|son)'),  # polish
1323
    ('się', '', '[bp]', '(Sem[16384]|sem)'),  # polish
1324
    ('się', '', '', '(Sen[16384]|sen)'),  # polish
1325
    ('sie', '', '', '(se|sje|Se[16384]|zi[128])'),
1326
    ('sio', '', '', '(So[16384]|so)'),
1327
    ('siu', '', '', '(Su[16384]|sju)'),
1328
    ('si', '[äöüaáuiíoóeéêy]', '', '(Si[16384]|si|zi[37056])'),
1329
    ('si', '', '', '(Si[16384]|si|zi[128])'),
1330
    ('s', '[aáuiíoóeéêy]', '[aáuíoóeéêy]', '(s|z[37056])'),
1331
    ('s', '', '[aeouäöü]', '(s|z[128])'),
1332
    ('s', '[aeiouy]', '[dglmnrv]', '(s|z|Z[32768]|[64])'),  # Groslot
1333
    ('s', '', '[dglmnrv]', '(s|z|Z[32768])'),
1334
    # Processing of "g"
1335
    ('gue', '', '$', '(k[64]|gve)'),  # portuguese+spanish
1336
    ('gu', '', '[ei]', '(g[64]|gv[294912])'),  # portuguese+spanish
1337
    ('gu', '', '[ao]', 'gv'),  # portuguese+spanish
1338
    ('guy', '', '', 'gi'),  # french
1339
    ('gli', '', '', '(glI|l[4096])'),
1340
    ('gni', '', '', '(gnI|ni[4160])'),
1341
    ('gn', '', '[aeou]', '(n[4160]|nj[4160]|gn)'),
1342
    ('ggie', '', '', '(je[512]|dZe)'),  # dZ is Italian
1343
    ('ggi', '', '[aou]', '(j[512]|dZ)'),  # dZ is Italian
1344
    ('ggi', '[yaeiou]', '[aou]', '(gI|dZ[4096]|j[512])'),
1345
    ('gge', '[yaeiou]', '', '(gE|xe[262144]|gZe[32832]|dZe[331808]|je[512])'),
1346
    ('ggi', '[yaeiou]', '', '(gI|xi[262144]|gZi[32832]|dZi[331808]|i[512])'),
1347
    ('ggi', '', '[aou]', '(gI|dZ[4096]|j[512])'),
1348
    ('gie', '', '$', '(ge|gi[128]|ji[64]|dZe[4096])'),
1349
    ('gie', '', '', '(ge|gi[128]|dZe[4096]|je[512])'),
1350
    ('gi', '', '[aou]', '(i[512]|dZ)'),  # dZ is Italian
1351
    ('ge', '[yaeiou]', '', '(gE|xe[262144]|Ze[32832]|dZe[331808])'),
1352
    ('gi', '[yaeiou]', '', '(gI|xi[262144]|Zi[32832]|dZi[331808])'),
1353
    ('ge', '', '', '(gE|xe[262144]|hE[131072]|je[512]|Ze[32832]|dZe[331808])'),
1354
    ('gi', '', '', '(gI|xi[262144]|hI[131072]|i[512]|Zi[32832]|dZi[331808])'),
1355
    ('gy', '', '[aeouáéóúüöőű]', '(gi|dj[2048])'),
1356
    ('gy', '', '', '(gi|d[2048])'),
1357
    ('g', '[yaeiou]', '[aouyei]', 'g'),
1358
    ('g', '', '[aouei]', '(g|h[131072])'),
1359
    # Processing of "j"
1360
    ('ij', '', '', '(i|ej[16]|ix[262144]|iZ[622656])'),
1361
    ('j', '', '[aoeiuy]', '(j|dZ[32]|x[262144]|Z[622656])'),
1362
    # Processing of "z"
1363
    ('rz', 't', '', '(S[16384]|r)'),  # polish
1364
    ('rz', '', '', '(rz|rts[128]|Z[16384]|r[16384]|rZ[16384])'),
1365
    ('tz', '', '$', '(ts|tS[160])'),
1366
    ('tz', '^', '', '(ts[131232]|tS[160])'),
1367
    ('tz', '', '', '(ts[131232]|tz)'),
1368
    ('zia', '', '[bcdgkpstwzż]', '(Za[16384]|za[16384]|zja)'),
1369
    ('zia', '', '', '(Za[16384]|zja)'),
1370
    ('zią', '', '[bp]', '(Zom[16384]|zom)'),  # polish
1371
    ('zią', '', '', '(Zon[16384]|zon)'),  # polish
1372
    ('zię', '', '[bp]', '(Zem[16384]|zem)'),  # polish
1373
    ('zię', '', '', '(Zen[16384]|zen)'),  # polish
1374
    ('zie', '', '[bcdgkpstwzż]', '(Ze[16384]|ze[16384]|ze|tsi[128])'),
1375
    ('zie', '', '', '(ze|Ze[16384]|tsi[128])'),
1376
    ('zio', '', '', '(Zo[16384]|zo)'),
1377
    ('ziu', '', '', '(Zu[16384]|zju)'),
1378
    ('zi', '', '', '(Zi[16384]|zi|tsi[128]|dzi[4096]|tsi[4096]|si[262144])'),
1379
    (
1380
        'z',
1381
        '',
1382
        '$',
1383
        '(s|ts[128]|ts[4096]|S[32768])',
1384
    ),  # ts It, s/S/Z Port, s in Sp, z Fr  # noqa: E501
1385
    ('z', '', '[bdgv]', '(z|dz[4096]|Z[32768])'),  # dz It, Z/z Port, z Sp & Fr
1386
    ('z', '', '[ptckf]', '(s|ts[4096]|S[32768])'),  # ts It, s/S/z Port, z/s Sp
1387
    # VOWELS
1388
    ('aue', '', '', 'aue'),
1389
    ('oue', '', '', '(oue|ve[64])'),
1390
    ('eau', '', '', 'o'),  # French
1391
    ('ae', '', '', '(Y[128]|aje[131072]|ae)'),
1392
    ('ai', '', '', 'aj'),
1393
    ('au', '', '', '(au|o[64])'),
1394
    ('ay', '', '', 'aj'),
1395
    ('ão', '', '', '(au|an)'),  # Port
1396
    ('ãe', '', '', '(aj|an)'),  # Port
1397
    ('ãi', '', '', '(aj|an)'),  # Port
1398
    ('ea', '', '', '(ea|ja[65536])'),
1399
    ('ee', '', '', '(i[32]|aje[131072]|e)'),
1400
    ('ei', '', '', '(aj|ej)'),
1401
    ('eu', '', '', '(eu|Yj[128]|ej[128]|oj[128]|Y[16])'),
1402
    ('ey', '', '', '(aj|ej)'),
1403
    ('ia', '', '', 'ja'),
1404
    # ("ie","","","(Die[8192]|i[128]|e[16384]|ije[131072]|Q[16]|je)"), # Beider_2017  # noqa: E501
1405
    ('ie', '', '', '(i[128]|e[16384]|ije[131072]|Q[16]|je)'),
1406
    ('ii', '', '$', 'i'),  # russian
1407
    ('io', '', '', '(jo|e[131072])'),
1408
    # ("iu","","","(Diu[8192]|ju)"), # Beider_2017
1409
    ('iu', '', '', 'ju'),
1410
    ('iy', '', '$', 'i'),  # russian
1411
    ('oe', '', '', '(Y[128]|oje[131072]|u[16]|oe)'),
1412
    ('oi', '', '', 'oj'),
1413
    ('oo', '', '', '(u[32]|o)'),
1414
    ('ou', '', '', '(ou|u[576]|au[16])'),
1415
    ('où', '', '', 'u'),  # french
1416
    ('oy', '', '', 'oj'),
1417
    ('õe', '', '', '(oj|on)'),  # Port
1418
    ('ua', '', '', 'va'),
1419
    ('ue', '', '', '(Q[128]|uje[131072]|ve)'),
1420
    # ("ui","","","(Dui[8192]|uj|vi|Y[16])"), # Beider_2017
1421
    ('ui', '', '', '(uj|vi|Y[16])'),
1422
    ('uu', '', '', '(u|Q[16])'),
1423
    ('uo', '', '', '(vo|o)'),
1424
    ('uy', '', '', 'uj'),
1425
    ('ya', '', '', 'ja'),
1426
    ('ye', '', '', '(je|ije[131072])'),
1427
    ('yi', '^', '', 'i'),
1428
    ('yi', '', '$', 'i'),  # russian
1429
    ('yo', '', '', '(jo|e[131072])'),
1430
    ('yu', '', '', 'ju'),
1431
    ('yy', '', '$', 'i'),  # russian
1432
    ('i', '[áóéê]', '', 'j'),
1433
    ('y', '[áóéê]', '', 'j'),
1434
    ('e', '^', '', '(e|je[131072])'),
1435
    ('e', '', '$', '(e|EE[96])'),
1436
    # LANGUAGE SPECIFIC CHARACTERS
1437
    ('ą', '', '[bp]', 'om'),  # polish
1438
    ('ą', '', '', 'on'),  # polish
1439
    ('ä', '', '', '(Y|e)'),
1440
    ('á', '', '', 'a'),  # Port & Sp
1441
    ('à', '', '', 'a'),
1442
    ('â', '', '', 'a'),
1443
    ('ã', '', '', '(a|an)'),  # Port
1444
    ('ă', '', '', '(e[65536]|a)'),  # romanian
1445
    ('ā', '', '', 'a'),  # latvian
1446
    ('č', '', '', 'tS'),  # czech + latvian
1447
    ('ć', '', '', '(tS[16384]|ts)'),  # polish
1448
    ('ç', '', '', '(s|tS[524288])'),
1449
    ('ď', '', '', '(d|dj[8])'),
1450
    ('ę', '', '[bp]', 'em'),  # polish
1451
    ('ę', '', '', 'en'),  # polish
1452
    ('é', '', '', 'e'),
1453
    ('è', '', '', 'e'),
1454
    ('ê', '', '', 'e'),
1455
    ('ě', '', '', '(e|je[8])'),
1456
    ('ē', '', '', 'e'),  # latvian
1457
    ('ģ', '', '', '(d|dj)'),  # latvian
1458
    ('ğ', '', '', ''),  # turkish
1459
    ('í', '', '', 'i'),
1460
    ('î', '', '', 'i'),
1461
    ('ī', '', '', 'i'),  # latvian
1462
    ('ı', '', '', '(i|e[524288]|[524288])'),
1463
    ('ķ', '', '', '(k|t[8192]|tj[8192])'),  # latvian
1464
    # ("ļ","","","(l|lj)"), # latvian Beider_2017
1465
    ('ļ', '', '', 'l'),  # latvian
1466
    ('ł', '', '', 'l'),
1467
    ('ń', '', '', '(n|nj[16384])'),  # polish
1468
    ('ñ', '', '', '(n|nj[262144])'),
1469
    ('ņ', '', '', '(n|nj[8192])'),
1470
    ('ó', '', '', '(u[16384]|o)'),
1471
    ('ô', '', '', 'o'),  # Port & Fr
1472
    ('õ', '', '', '(o|on[32768]|Y[2048])'),
1473
    ('ò', '', '', 'o'),  # Sp & It
1474
    ('ö', '', '', 'Y'),
1475
    ('ř', '', '', '(r|rZ[8])'),
1476
    ('ś', '', '', '(S[16384]|s)'),
1477
    ('ş', '', '', 'S'),  # romanian+turkish
1478
    ('š', '', '', 'S'),  # czech + latvian
1479
    ('ţ', '', '', 'ts'),  # romanian
1480
    ('ť', '', '', '(t|tj[8])'),
1481
    ('ű', '', '', 'Q'),  # hungarian
1482
    ('ü', '', '', '(Q|u[294912])'),
1483
    ('ū', '', '', 'u'),  # latvian
1484
    ('ú', '', '', 'u'),
1485
    ('ů', '', '', 'u'),  # czech
1486
    ('ù', '', '', 'u'),  # french
1487
    ('ý', '', '', 'i'),  # czech
1488
    ('ż', '', '', 'Z'),  # polish
1489
    ('ź', '', '', '(Z[16384]|z)'),
1490
    ('ž', '', '', 'Z'),  # czech + latvian
1491
    ('ß', '', '', 's'),  # german
1492
    ('\'', '', '', ''),  # russian
1493
    ('"', '', '', ''),  # russian
1494
    ('o', '', '[bcćdgklłmnńrsśtwzźż]', '(O|P[16384])'),
1495
    # LATIN ALPHABET
1496
    ('a', '', '', 'A'),
1497
    ('b', '', '', 'B'),
1498
    ('c', '', '', '(k|ts[16392]|dZ[524288])'),
1499
    ('d', '', '', 'd'),
1500
    ('e', '', '', 'E'),
1501
    ('f', '', '', 'f'),
1502
    # ("g","","","(g|x[16])"), # Dutch sound disabled
1503
    ('g', '', '', 'g'),
1504
    ('h', '', '', '(h|x[65536]|H[299072])'),
1505
    ('i', '', '', 'I'),
1506
    ('j', '', '', '(j|x[262144]|Z[622656])'),
1507
    ('k', '', '', 'k'),
1508
    ('l', '', '', 'l'),
1509
    ('m', '', '', 'm'),
1510
    ('n', '', '', 'n'),
1511
    ('o', '', '', 'O'),
1512
    ('p', '', '', 'p'),
1513
    ('q', '', '', 'k'),
1514
    ('r', '', '', 'r'),
1515
    ('s', '', '', '(s|S[32768])'),
1516
    ('t', '', '', 't'),
1517
    ('u', '', '', 'U'),
1518
    ('v', '', '', 'V'),
1519
    ('w', '', '', '(v|w[48])'),
1520
    ('x', '', '', '(ks|gz|S[294912])'),  # S/ks Port & Sp, gz Sp, It only ks
1521
    ('y', '', '', 'i'),
1522
    (
1523
        'z',
1524
        '',
1525
        '',
1526
        '(z|ts[128]|dz[4096]|ts[4096]|s[262144])',
1527
    ),  # ts/dz It, z Port & Fr, z/s Sp  # noqa: E501
1528
)
1529
1530
# gen/rulesarabic.php
1531
1532
# General
1533 1
_GEN_RULES_ARABIC = (
1534
    ('ا', '', '', 'a'),  # alif isol & init
1535
    ('ب', '', '$', 'b'),
1536
    ('ب', '', '', 'b1'),  # ba' isol
1537
    ('ت', '', '$', 't'),
1538
    ('ت', '', '', 't1'),  # ta' isol
1539
    ('ث', '', '$', 't'),
1540
    ('ث', '', '', 't1'),  # tha' isol
1541
    ('ج', '', '$', '(dZ|Z)'),
1542
    ('ج', '', '', '(dZ1|Z1)'),  # jim isol
1543
    ('ح', '^', '', '1'),
1544
    ('ح', '', '$', '1'),
1545
    ('ح', '', '', '(h1|1)'),  # h.a' isol
1546
    ('خ', '', '$', 'x'),
1547
    ('خ', '', '', 'x1'),  # kha' isol
1548
    ('د', '', '$', 'd'),
1549
    ('د', '', '', 'd1'),  # dal isol & init
1550
    ('ذ', '', '$', 'd'),
1551
    ('ذ', '', '', 'd1'),  # dhal isol & init
1552
    ('ر', '', '$', 'r'),
1553
    ('ر', '', '', 'r1'),  # ra' isol & init
1554
    ('ز', '', '$', 'z'),
1555
    ('ز', '', '', 'z1'),  # za' isol & init
1556
    ('س', '', '$', 's'),
1557
    ('س', '', '', 's1'),  # sin isol
1558
    ('ش', '', '$', 'S'),
1559
    ('ش', '', '', 'S1'),  # shin isol
1560
    ('ص', '', '$', 's'),
1561
    ('ص', '', '', 's1'),  # s.ad isol
1562
    ('ض', '', '$', 'd'),
1563
    ('ض', '', '', 'd1'),  # d.ad isol
1564
    ('ط', '', '$', 't'),
1565
    ('ط', '', '', 't1'),  # t.a' isol
1566
    ('ظ', '', '$', 'z'),
1567
    ('ظ', '', '', 'z1'),  # z.a' isol
1568
    ('ع', '^', '', '1'),
1569
    ('ع', '', '$', '1'),
1570
    ('ع', '', '', '(h1|1)'),  # ayin isol
1571
    ('غ', '', '$', 'g'),
1572
    ('غ', '', '', 'g1'),  # ghayin isol
1573
    ('ف', '', '$', 'f'),
1574
    ('ف', '', '', 'f1'),  # fa' isol
1575
    ('ق', '', '$', 'k'),
1576
    ('ق', '', '', 'k1'),  # qaf isol
1577
    ('ك', '', '$', 'k'),
1578
    ('ك', '', '', 'k1'),  # kaf isol
1579
    ('ل', '', '$', 'l'),
1580
    ('ل', '', '', 'l1'),  # lam isol
1581
    ('م', '', '$', 'm'),
1582
    ('م', '', '', 'm1'),  # mim isol
1583
    ('ن', '', '$', 'n'),
1584
    ('ن', '', '', 'n1'),  # nun isol
1585
    ('ه', '^', '', '1'),
1586
    ('ه', '', '$', '1'),
1587
    ('ه', '', '', '(h1|1)'),  # h isol
1588
    ('و', '', '$', '(u|v)'),
1589
    ('و', '', '', '(u|v1)'),  # waw, isol + init
1590
    ('ي‎', '', '$', '(i|j)'),
1591
    ('ي‎', '', '', '(i|j1)'),  # ya' isol
1592
)
1593
1594
# gen/rulescyrillic.php
1595
1596
# GENERAL
1597 1
_GEN_RULES_CYRILLIC = (
1598
    ('ця', '', '', 'tsa'),
1599
    ('цю', '', '', 'tsu'),
1600
    ('циа', '', '', 'tsa'),
1601
    ('цие', '', '', 'tse'),
1602
    ('цио', '', '', 'tso'),
1603
    ('циу', '', '', 'tsu'),
1604
    ('сие', '', '', 'se'),
1605
    ('сио', '', '', 'so'),
1606
    ('зие', '', '', 'ze'),
1607
    ('зио', '', '', 'zo'),
1608
    ('с', '', 'с', ''),
1609
    ('гауз', '', '$', 'haus'),
1610
    ('гаус', '', '$', 'haus'),
1611
    ('гольц', '', '$', 'holts'),
1612
    ('геймер', '', '$', '(hejmer|hajmer)'),
1613
    ('гейм', '', '$', '(hejm|hajm)'),
1614
    ('гоф', '', '$', 'hof'),
1615
    ('гер', '', '$', 'ger'),
1616
    ('ген', '', '$', 'gen'),
1617
    ('гин', '', '$', 'gin'),
1618
    ('г', '(й|ё|я|ю|ы|а|е|о|и|у)', '(а|е|о|и|у)', 'g'),
1619
    ('г', '', '(а|е|о|и|у)', '(g|h)'),
1620
    ('ля', '', '', 'la'),
1621
    ('лю', '', '', 'lu'),
1622
    ('лё', '', '', '(le|lo)'),
1623
    ('лио', '', '', '(le|lo)'),
1624
    ('ле', '', '', '(lE|lo)'),
1625
    ('ийе', '', '', 'je'),
1626
    ('ие', '', '', 'je'),
1627
    ('ыйе', '', '', 'je'),
1628
    ('ые', '', '', 'je'),
1629
    ('ий', '', '(а|о|у)', 'j'),
1630
    ('ый', '', '(а|о|у)', 'j'),
1631
    ('ий', '', '$', 'i'),
1632
    ('ый', '', '$', 'i'),
1633
    ('ей', '^', '', '(jej|ej)'),
1634
    ('е', '(а|е|о|у)', '', 'je'),
1635
    ('е', '^', '', 'je'),
1636
    ('эй', '', '', 'ej'),
1637
    ('ей', '', '', 'ej'),
1638
    ('ауе', '', '', 'aue'),
1639
    ('ауэ', '', '', 'aue'),
1640
    ('а', '', '', 'a'),
1641
    ('б', '', '', 'b'),
1642
    ('в', '', '', 'v'),
1643
    ('г', '', '', 'g'),
1644
    ('д', '', '', 'd'),
1645
    ('е', '', '', 'E'),
1646
    ('ё', '', '', '(e|jo)'),
1647
    ('ж', '', '', 'Z'),
1648
    ('з', '', '', 'z'),
1649
    ('и', '', '', 'I'),
1650
    ('й', '', '', 'j'),
1651
    ('к', '', '', 'k'),
1652
    ('л', '', '', 'l'),
1653
    ('м', '', '', 'm'),
1654
    ('н', '', '', 'n'),
1655
    ('о', '', '', 'o'),
1656
    ('п', '', '', 'p'),
1657
    ('р', '', '', 'r'),
1658
    ('с', '', '', 's'),
1659
    ('т', '', '', 't'),
1660
    ('у', '', '', 'u'),
1661
    ('ф', '', '', 'f'),
1662
    ('х', '', '', 'x'),
1663
    ('ц', '', '', 'ts'),
1664
    ('ч', '', '', 'tS'),
1665
    ('ш', '', '', 'S'),
1666
    ('щ', '', '', 'StS'),
1667
    ('ъ', '', '', ''),
1668
    ('ы', '', '', 'I'),
1669
    ('ь', '', '', ''),
1670
    ('э', '', '', 'E'),
1671
    ('ю', '', '', 'ju'),
1672
    ('я', '', '', 'ja'),
1673
)
1674
1675
# gen/rulesczech.php
1676
1677 1
_GEN_RULES_CZECH = (
1678
    ('ch', '', '', 'x'),
1679
    ('qu', '', '', '(k|kv)'),
1680
    ('aue', '', '', 'aue'),
1681
    ('ei', '', '', '(ej|aj)'),
1682
    ('i', '[aou]', '', 'j'),
1683
    ('i', '', '[aeou]', 'j'),
1684
    ('č', '', '', 'tS'),
1685
    ('š', '', '', 'S'),
1686
    ('ž', '', '', 'Z'),
1687
    ('ň', '', '', 'n'),
1688
    ('ť', '', '', '(t|tj)'),
1689
    ('ď', '', '', '(d|dj)'),
1690
    ('ř', '', '', '(r|rZ)'),
1691
    ('á', '', '', 'a'),
1692
    ('é', '', '', 'e'),
1693
    ('í', '', '', 'i'),
1694
    ('ó', '', '', 'o'),
1695
    ('ú', '', '', 'u'),
1696
    ('ý', '', '', 'i'),
1697
    ('ě', '', '', '(e|je)'),
1698
    ('ů', '', '', 'u'),
1699
    # LATIN ALPHABET
1700
    ('a', '', '', 'a'),
1701
    ('b', '', '', 'b'),
1702
    ('c', '', '', 'ts'),
1703
    ('d', '', '', 'd'),
1704
    ('e', '', '', 'E'),
1705
    ('f', '', '', 'f'),
1706
    ('g', '', '', 'g'),
1707
    ('h', '', '', '(h|g)'),
1708
    ('i', '', '', 'I'),
1709
    ('j', '', '', 'j'),
1710
    ('k', '', '', 'k'),
1711
    ('l', '', '', 'l'),
1712
    ('m', '', '', 'm'),
1713
    ('n', '', '', 'n'),
1714
    ('o', '', '', 'o'),
1715
    ('p', '', '', 'p'),
1716
    ('q', '', '', '(k|kv)'),
1717
    ('r', '', '', 'r'),
1718
    ('s', '', '', 's'),
1719
    ('t', '', '', 't'),
1720
    ('u', '', '', 'u'),
1721
    ('v', '', '', 'v'),
1722
    ('w', '', '', 'v'),
1723
    ('x', '', '', 'ks'),
1724
    ('y', '', '', 'i'),
1725
    ('z', '', '', 'z'),
1726
)
1727
1728
# gen/rulesdutch.php
1729
1730 1
# Dutch rules of the "gen" rule set.  Every entry is a 4-tuple of strings;
# from the values the fields look like (pattern, left context, right
# context, phonetic output) -- confirm against the code that reads the table.
# Entry order is kept exactly as generated: multi-letter patterns come before
# the single letters they start with, so reordering is presumably unsafe.
_GEN_RULES_DUTCH = (
    # CONSONANTS
    ('ssj', '', '', 'S'),
    ('sj', '', '', 'S'),
    ('ch', '', '', 'x'),
    ('c', '', '[eiy]', 'ts'),
    ('ck', '', '', 'k'),  # German
    ('pf', '', '', '(pf|p|f)'),  # German
    ('ph', '', '', '(ph|f)'),
    ('qu', '', '', 'kv'),
    ('th', '^', '', 't'),  # German
    ('th', '', '[äöüaeiou]', '(t|th)'),  # German
    ('th', '', '', 't'),  # German
    ('ss', '', '', 's'),
    ('h', '[aeiouy]', '', ''),
    # VOWELS
    ('aue', '', '', 'aue'),
    ('ou', '', '', 'au'),
    ('ie', '', '', '(Q|i)'),
    ('uu', '', '', '(Q|u)'),
    ('ee', '', '', 'e'),
    ('eu', '', '', '(Y|Yj)'),  # Dutch Y
    ('aa', '', '', 'a'),
    ('oo', '', '', 'o'),
    ('oe', '', '', 'u'),
    ('ij', '', '', 'ej'),
    ('ui', '', '', '(Y|uj)'),
    ('ei', '', '', '(ej|aj)'),  # Dutch ej
    ('i', '', '[aou]', 'j'),
    ('y', '', '[aeou]', 'j'),
    ('i', '[aou]', '', 'j'),
    ('y', '[aeou]', '', 'j'),
    # LATIN ALPHABET
    ('a', '', '', 'a'),
    ('b', '', '', 'b'),
    ('c', '', '', 'k'),
    ('d', '', '', 'd'),
    ('e', '', '', 'e'),
    ('f', '', '', 'f'),
    ('g', '', '', '(g|x)'),
    ('h', '', '', 'h'),
    ('i', '', '', '(i|Q)'),
    ('j', '', '', 'j'),
    ('k', '', '', 'k'),
    ('l', '', '', 'l'),
    ('m', '', '', 'm'),
    ('n', '', '', 'n'),
    ('o', '', '', 'o'),
    ('p', '', '', 'p'),
    ('q', '', '', 'k'),
    ('r', '', '', 'r'),
    ('s', '', '', 's'),
    ('t', '', '', 't'),
    ('u', '', '', '(u|Q)'),
    ('v', '', '', 'v'),
    ('w', '', '', '(w|v)'),
    ('x', '', '', 'ks'),
    ('y', '', '', 'i'),
    ('z', '', '', 'z'),
)

# Generated from gen/rulesenglish.php (BMPM PHP reference implementation)

# GENERAL
# English rules of the "gen" rule set.  Every entry is a 4-tuple of strings;
# from the values the fields look like (pattern, left context, right
# context, phonetic output) -- confirm against the code that reads the table.
# Entry order is kept exactly as generated: multi-letter patterns come before
# the single letters they start with, so reordering is presumably unsafe.
_GEN_RULES_ENGLISH = (
    # CONSONANTS
    ('’', '', '', ''),  # O’Neill
    ('\'', '', '', ''),  # O’Neill
    ('mc', '^', '', 'mak'),  # McDonald
    ('tz', '', '', 'ts'),  # Fitzgerald
    ('tch', '', '', 'tS'),
    ('ch', '', '', '(tS|x)'),
    ('ck', '', '', 'k'),
    ('cc', '', '[iey]', 'ks'),  # success, accent
    ('c', '', 'c', ''),
    ('c', '', '[iey]', 's'),  # circle
    ('gh', '^', '', 'g'),  # ghost
    ('gh', '', '', '(g|f|w)'),  # burgh | tough | bough
    ('gn', '', '', '(gn|n)'),
    ('g', '', '[iey]', '(g|dZ)'),  # get, gem, giant, gigabyte
    # ("th","","","(6|8|t)"),
    ('th', '', '', 't'),
    ('kh', '', '', 'x'),
    ('ph', '', '', 'f'),
    ('sch', '', '', '(S|sk)'),
    ('sh', '', '', 'S'),
    ('who', '^', '', 'hu'),
    ('wh', '^', '', 'w'),
    ('h', '', '$', ''),  # hard to find an example that isn't in a name
    ('h', '', '[^aeiou]', ''),  # hard to find an example that isn't in a name
    ('h', '^', '', 'H'),
    ('kn', '^', '', 'n'),  # knight
    ('mb', '', '$', 'm'),
    ('ng', '', '$', '(N|ng)'),
    ('pn', '^', '', '(pn|n)'),
    ('ps', '^', '', '(ps|s)'),
    ('qu', '', '', 'kw'),
    ('tia', '', '', '(So|Sa)'),
    ('tio', '', '', 'So'),
    ('wr', '^', '', 'r'),
    ('x', '^', '', 'z'),
    # VOWELS
    # NOTE(review): the first 'y' entry has an empty right context; if the
    # first matching entry wins, the two entries after it (same '^' left
    # context) can never be selected.  Kept verbatim from the reference.
    ('y', '^', '', 'j'),
    ('y', '^', '[aeiouy]', 'j'),
    ('yi', '^', '', 'i'),
    ('aue', '', '', 'aue'),
    ('oue', '', '', '(aue|oue)'),
    ('ai', '', '', '(aj|ej|e)'),  # rain | said
    ('ay', '', '', '(aj|ej)'),
    ('a', '', '[^aeiou]e', 'ej'),  # plane
    ('ei', '', '', '(ej|aj|i)'),  # weigh | receive
    ('ey', '', '', '(ej|aj|i)'),  # hey | barley
    ('ear', '', '', 'ia'),  # tear
    ('ea', '', '', '(i|e)'),  # reason | treasure
    ('ee', '', '', 'i'),  # between
    ('e', '', '[^aeiou]e', 'i'),  # meter
    ('e', '', '$', '(|E)'),  # blame, badge
    ('ie', '', '', 'i'),  # believe
    ('i', '', '[^aeiou]e', 'aj'),  # five
    ('oa', '', '', 'ou'),  # toad
    ('oi', '', '', 'oj'),  # join
    ('oo', '', '', 'u'),  # food
    ('ou', '', '', '(u|ou)'),  # through | tough | could
    ('oy', '', '', 'oj'),  # boy
    ('o', '', '[^aeiou]e', 'ou'),  # rode
    ('u', '', '[^aeiou]e', '(ju|u)'),  # cute | flute
    # turn -- Morse disagrees, feels it should go to E
    ('u', '', 'r', '(e|u)'),
    # LATIN ALPHABET
    ('a', '', '', '(e|o|a)'),  # hat | call | part
    ('b', '', '', 'b'),
    ('c', '', '', 'k'),  # candy
    ('d', '', '', 'd'),
    ('e', '', '', 'E'),  # bed
    ('f', '', '', 'f'),
    ('g', '', '', 'g'),
    ('h', '', '', 'h'),
    ('i', '', '', 'I'),
    ('j', '', '', 'dZ'),
    ('k', '', '', 'k'),
    ('l', '', '', 'l'),
    ('m', '', '', 'm'),
    ('n', '', '', 'n'),
    ('o', '', '', '(o|a)'),  # hot
    ('p', '', '', 'p'),
    ('q', '', '', 'k'),
    ('r', '', '', 'r'),
    ('s', '', '', 's'),
    ('t', '', '', 't'),
    ('u', '', '', '(u|a)'),  # put
    ('v', '', '', 'v'),
    # the variant "v" is for spellings coming from German/Polish
    ('w', '', '', '(w|v)'),
    ('x', '', '', 'ks'),
    ('y', '', '', 'i'),
    ('z', '', '', 'z'),
)

# Generated from gen/rulesfrench.php (BMPM PHP reference implementation)

# GENERAL
# French rules of the "gen" rule set.  Every entry is a 4-tuple of strings;
# from the values the fields look like (pattern, left context, right
# context, phonetic output) -- confirm against the code that reads the table.
# Entry order is kept exactly as generated: multi-letter patterns come before
# the single letters they start with, so reordering is presumably unsafe.
_GEN_RULES_FRENCH = (
    # CONSONANTS
    ('lt', 'u', '$', '(lt|)'),  # Renault
    ('c', 'n', '$', '(k|)'),  # Tronc
    # ("f","","","(f|)"), # Clef
    ('d', '', '$', '(t|)'),  # Durand
    ('g', 'n', '$', '(k|)'),  # Gang
    ('p', '', '$', '(p|)'),  # Trop, Champ
    ('r', 'e', '$', '(r|)'),  # Barbier
    ('t', '', '$', '(t|)'),  # Murat, Constant
    ('z', '', '$', '(s|)'),
    ('ds', '', '$', '(ds|)'),
    ('ps', '', '$', '(ps|)'),  # Champs
    ('rs', 'e', '$', '(rs|)'),
    ('ts', '', '$', '(ts|)'),
    ('s', '', '$', '(s|)'),  # Denis
    ('x', 'u', '$', '(ks|)'),  # Arnoux
    # Deschamps, Malesherbes, Groslot
    ('s', '[aeéèêiou]', '[^aeéèêiou]', '(s|)'),
    ('t', '[aeéèêiou]', '[^aeéèêiou]', '(t|)'),  # Petitjean
    ('kh', '', '', 'x'),  # foreign
    ('ph', '', '', 'f'),
    ('ç', '', '', 's'),
    ('x', '', '', 'ks'),
    ('ch', '', '', 'S'),
    ('c', '', '[eiyéèê]', 's'),
    ('gn', '', '', '(n|gn)'),
    ('g', '', '[eiy]', 'Z'),
    ('gue', '', '$', 'k'),
    ('gu', '', '[eiy]', 'g'),
    ('aill', '', 'e', 'aj'),  # non Jewish
    ('ll', '', 'e', '(l|j)'),  # non Jewish
    ('que', '', '$', 'k'),
    ('qu', '', '', 'k'),
    ('s', '[aeiouyéèê]', '[aeiouyéèê]', 'z'),
    ('h', '[bdgt]', '', ''),  # translit from Arabic
    ('m', '[aeiouy]', '[aeiouy]', 'm'),
    ('m', '[aeiouy]', '', '(m|n)'),  # nasal
    ('ou', '', '[aeio]', 'v'),
    ('u', '', '[aeio]', 'v'),
    # VOWELS
    ('aue', '', '', 'aue'),
    ('eau', '', '', 'o'),
    ('au', '', '', '(o|au)'),  # non Jewish
    ('ai', '', '', '(e|aj)'),  # [e] is non Jewish
    ('ay', '', '', '(e|aj)'),  # [e] is non Jewish
    ('é', '', '', 'e'),
    ('ê', '', '', 'e'),
    ('è', '', '', 'e'),
    ('à', '', '', 'a'),
    ('â', '', '', 'a'),
    ('où', '', '', 'u'),
    ('ou', '', '', 'u'),
    ('oi', '', '', '(oj|va)'),  # [va] (actually "ua") is non Jewish
    ('ei', '', '', '(aj|ej|e)'),  # [e] is non Jewish
    ('ey', '', '', '(aj|ej|e)'),  # [e] non Jewish
    ('eu', '', '', '(ej|Y)'),  # non Jewish
    ('y', '[ou]', '', 'j'),
    ('e', '', '$', '(e|)'),
    ('i', '', '[aou]', 'j'),
    ('y', '', '[aoeu]', 'j'),
    # LATIN ALPHABET
    ('a', '', '', 'a'),
    ('b', '', '', 'b'),
    ('c', '', '', 'k'),
    ('d', '', '', 'd'),
    ('e', '', '', 'e'),
    ('f', '', '', 'f'),
    ('g', '', '', 'g'),
    ('h', '', '', 'h'),
    ('i', '', '', 'i'),
    ('j', '', '', 'Z'),
    ('k', '', '', 'k'),
    ('l', '', '', 'l'),
    ('m', '', '', 'm'),
    ('n', '', '', 'n'),
    ('o', '', '', 'o'),
    ('p', '', '', 'p'),
    ('q', '', '', 'k'),
    ('r', '', '', 'r'),
    ('s', '', '', 's'),
    ('t', '', '', 't'),
    ('u', '', '', '(u|Q)'),
    ('v', '', '', 'v'),
    ('w', '', '', 'v'),
    ('y', '', '', 'i'),
    ('z', '', '', 'z'),
)

# Generated from gen/rulesgerman.php (BMPM PHP reference implementation)

# GENERIC
# German rules of the "gen" rule set.  Every entry is a 4-tuple of strings;
# from the values the fields look like (pattern, left context, right
# context, phonetic output) -- confirm against the code that reads the table.
# Entry order is kept exactly as generated: multi-letter patterns come before
# the single letters they start with, so reordering is presumably unsafe.
_GEN_RULES_GERMAN = (
    # CONSONANTS
    ('ewitsch', '', '$', 'evitS'),
    ('owitsch', '', '$', 'ovitS'),
    ('evitsch', '', '$', 'evitS'),
    ('ovitsch', '', '$', 'ovitS'),
    ('witsch', '', '$', 'vitS'),
    ('vitsch', '', '$', 'vitS'),
    ('ssch', '', '', 'S'),
    ('chsch', '', '', 'xS'),
    ('sch', '', '', 'S'),
    ('ziu', '', '', 'tsu'),
    ('zia', '', '', 'tsa'),
    ('zio', '', '', 'tso'),
    ('chs', '', '', 'ks'),
    ('ch', '', '', 'x'),
    ('ck', '', '', 'k'),
    ('c', '', '[eiy]', 'ts'),
    ('sp', '^', '', 'Sp'),
    ('st', '^', '', 'St'),
    ('ssp', '', '', '(Sp|sp)'),
    ('sp', '', '', '(Sp|sp)'),
    ('sst', '', '', '(St|st)'),
    ('st', '', '', '(St|st)'),
    ('pf', '', '', '(pf|p|f)'),
    ('ph', '', '', '(ph|f)'),
    ('qu', '', '', 'kv'),
    ('ewitz', '', '$', '(evits|evitS)'),
    ('ewiz', '', '$', '(evits|evitS)'),
    ('evitz', '', '$', '(evits|evitS)'),
    ('eviz', '', '$', '(evits|evitS)'),
    ('owitz', '', '$', '(ovits|ovitS)'),
    ('owiz', '', '$', '(ovits|ovitS)'),
    ('ovitz', '', '$', '(ovits|ovitS)'),
    ('oviz', '', '$', '(ovits|ovitS)'),
    ('witz', '', '$', '(vits|vitS)'),
    ('wiz', '', '$', '(vits|vitS)'),
    ('vitz', '', '$', '(vits|vitS)'),
    ('viz', '', '$', '(vits|vitS)'),
    ('tz', '', '', 'ts'),
    ('thal', '', '$', 'tal'),
    ('th', '^', '', 't'),
    ('th', '', '[äöüaeiou]', '(t|th)'),
    ('th', '', '', 't'),
    ('rh', '^', '', 'r'),
    ('h', '[aeiouyäöü]', '', ''),
    ('h', '^', '', 'H'),
    ('ss', '', '', 's'),
    ('s', '', '[äöüaeiouy]', '(z|s)'),
    ('s', '[aeiouyäöüj]', '[aeiouyäöü]', 'z'),
    ('ß', '', '', 's'),
    # VOWELS
    ('ij', '', '$', 'i'),
    ('aue', '', '', 'aue'),
    ('ue', '', '', 'Q'),
    ('ae', '', '', 'Y'),
    ('oe', '', '', 'Y'),
    ('ü', '', '', 'Q'),
    ('ä', '', '', '(Y|e)'),
    ('ö', '', '', 'Y'),
    ('ei', '', '', '(aj|ej)'),
    ('ey', '', '', '(aj|ej)'),
    ('eu', '', '', '(Yj|ej|aj|oj)'),
    ('i', '[aou]', '', 'j'),
    ('y', '[aou]', '', 'j'),
    ('ie', '', '', 'I'),
    ('i', '', '[aou]', 'j'),
    ('y', '', '[aoeu]', 'j'),
    # FOREIGN LETTERs
    ('ñ', '', '', 'n'),
    ('ã', '', '', 'a'),
    ('ő', '', '', 'o'),
    ('ű', '', '', 'u'),
    ('ç', '', '', 's'),
    # LATIN ALPHABET
    ('a', '', '', 'A'),
    ('b', '', '', 'b'),
    ('c', '', '', 'k'),
    ('d', '', '', 'd'),
    ('e', '', '', 'E'),
    ('f', '', '', 'f'),
    ('g', '', '', 'g'),
    ('h', '', '', 'h'),
    ('i', '', '', 'I'),
    ('j', '', '', 'j'),
    ('k', '', '', 'k'),
    ('l', '', '', 'l'),
    ('m', '', '', 'm'),
    ('n', '', '', 'n'),
    ('o', '', '', 'O'),
    ('p', '', '', 'p'),
    ('q', '', '', 'k'),
    ('r', '', '', 'r'),
    ('s', '', '', 's'),
    ('t', '', '', 't'),
    ('u', '', '', 'U'),
    ('v', '', '', '(f|v)'),
    ('w', '', '', 'v'),
    ('x', '', '', 'ks'),
    ('y', '', '', 'i'),
    ('z', '', '', 'ts'),
)

# Generated from gen/rulesgreek.php (BMPM PHP reference implementation)

# Greek-script rules of the "gen" rule set.  Every entry is a 4-tuple of
# strings; from the values the fields look like (pattern, left context,
# right context, phonetic output) -- confirm against the code that reads the
# table.  Entry order is kept exactly as generated: digraphs come before the
# single letters they start with, so reordering is presumably unsafe.
_GEN_RULES_GREEK = (
    # "av" before vowels and voiced consonants, "af" elsewhere
    ('αυ', '', '$', 'af'),
    ('αυ', '', '(κ|π|σ|τ|φ|θ|χ|ψ)', 'af'),
    ('αυ', '', '', 'av'),
    # "ev" before vowels and voiced consonants, "ef" elsewhere
    ('ευ', '', '$', 'ef'),
    ('ευ', '', '(κ|π|σ|τ|φ|θ|χ|ψ)', 'ef'),
    ('ευ', '', '', 'ev'),
    # "iv" before vowels and voiced consonants, "if" elsewhere
    ('ηυ', '', '$', 'if'),
    ('ηυ', '', '(κ|π|σ|τ|φ|θ|χ|ψ)', 'if'),
    ('ηυ', '', '', 'iv'),
    ('ου', '', '', 'u'),  # [u:]
    ('αι', '', '', 'aj'),  # modern [e]
    ('ει', '', '', 'ej'),  # modern [i]
    ('οι', '', '', 'oj'),  # modern [i]
    ('ωι', '', '', 'oj'),
    ('ηι', '', '', 'ej'),
    ('υι', '', '', 'i'),  # modern Greek "i"
    ('γγ', '(ε|ι|η|α|ο|ω|υ)', '(ε|ι|η)', '(nj|j)'),
    ('γγ', '', '(ε|ι|η)', 'j'),
    ('γγ', '(ε|ι|η|α|ο|ω|υ)', '', '(ng|g)'),
    ('γγ', '', '', 'g'),
    ('γκ', '^', '', 'g'),
    ('γκ', '(ε|ι|η|α|ο|ω|υ)', '(ε|ι|η)', '(nj|j)'),
    ('γκ', '', '(ε|ι|η)', 'j'),
    ('γκ', '(ε|ι|η|α|ο|ω|υ)', '', '(ng|g)'),
    ('γκ', '', '', 'g'),
    ('γι', '', '(α|ο|ω|υ)', 'j'),
    ('γι', '', '', '(gi|i)'),
    ('γε', '', '(α|ο|ω|υ)', 'j'),
    ('γε', '', '', '(ge|je)'),
    ('κζ', '', '', 'gz'),
    ('τζ', '', '', 'dz'),
    ('σ', '', '(β|γ|δ|μ|ν|ρ)', 'z'),
    ('μβ', '', '', '(mb|b)'),
    ('μπ', '^', '', 'b'),
    ('μπ', '(ε|ι|η|α|ο|ω|υ)', '', 'mb'),
    ('μπ', '', '', 'b'),  # after any consonant
    ('ντ', '^', '', 'd'),
    ('ντ', '(ε|ι|η|α|ο|ω|υ)', '', '(nd|nt)'),  # Greek is "nd"
    ('ντ', '', '', '(nt|d)'),  # Greek is "d" after any consonant
    ('ά', '', '', 'a'),
    ('έ', '', '', 'e'),
    ('ή', '', '', '(i|e)'),
    ('ί', '', '', 'i'),
    ('ό', '', '', 'o'),
    ('ύ', '', '', '(Q|i|u)'),
    ('ώ', '', '', 'o'),
    ('ΰ', '', '', '(Q|i|u)'),
    ('ϋ', '', '', '(Q|i|u)'),
    ('ϊ', '', '', 'j'),
    ('α', '', '', 'a'),
    ('β', '', '', '(v|b)'),  # modern "v", old "b"
    ('γ', '', '', 'g'),
    ('δ', '', '', 'd'),  # modern like "th" in English "them", old "d"
    ('ε', '', '', 'e'),
    ('ζ', '', '', 'z'),
    ('η', '', '', '(i|e)'),  # modern "i", old "e:"
    ('ι', '', '', 'i'),
    ('κ', '', '', 'k'),
    ('λ', '', '', 'l'),
    ('μ', '', '', 'm'),
    ('ν', '', '', 'n'),
    ('ξ', '', '', 'ks'),
    ('ο', '', '', 'o'),
    ('π', '', '', 'p'),
    ('ρ', '', '', 'r'),
    ('σ', '', '', 's'),
    ('ς', '', '', 's'),
    ('τ', '', '', 't'),
    ('υ', '', '', '(Q|i|u)'),  # modern "i", old like German "ü"
    ('φ', '', '', 'f'),
    ('θ', '', '', 't'),  # old greek like "th" in English "theme"
    ('χ', '', '', 'x'),
    ('ψ', '', '', 'ps'),
    ('ω', '', '', 'o'),
)

# Generated from gen/rulesgreeklatin.php (BMPM PHP reference implementation)

# Latin-script Greek rules of the "gen" rule set.  Every entry is a 4-tuple
# of strings; from the values the fields look like (pattern, left context,
# right context, phonetic output) -- confirm against the code that reads the
# table.  Entry order is kept exactly as generated: multi-letter patterns
# come before the single letters they start with, so reordering is
# presumably unsafe.
_GEN_RULES_GREEKLATIN = (
    ('au', '', '$', 'af'),
    ('au', '', '[kpstfh]', 'af'),
    ('au', '', '', 'av'),
    ('eu', '', '$', 'ef'),
    ('eu', '', '[kpstfh]', 'ef'),
    ('eu', '', '', 'ev'),
    ('ou', '', '', 'u'),
    ('gge', '[aeiouy]', '', '(nje|je)'),  # aggelopoulos
    ('ggi', '[aeiouy]', '[aou]', '(nj|j)'),
    ('ggi', '[aeiouy]', '', '(ni|i)'),
    ('gge', '', '', 'je'),
    ('ggi', '', '', 'i'),
    ('gg', '[aeiouy]', '', '(ng|g)'),
    ('gg', '', '', 'g'),
    ('gk', '^', '', 'g'),
    ('gke', '[aeiouy]', '', '(nje|je)'),
    ('gki', '[aeiouy]', '', '(ni|i)'),
    ('gke', '', '', 'je'),
    ('gki', '', '', 'i'),
    ('gk', '[aeiouy]', '', '(ng|g)'),
    ('gk', '', '', 'g'),
    ('nghi', '', '[aouy]', 'Nj'),
    ('nghi', '', '', '(Ngi|Ni)'),
    ('nghe', '', '[aouy]', 'Nj'),
    ('nghe', '', '', '(Nje|Nge)'),
    ('ghi', '', '[aouy]', 'j'),
    ('ghi', '', '', '(gi|i)'),
    ('ghe', '', '[aouy]', 'j'),
    ('ghe', '', '', '(je|ge)'),
    ('ngh', '', '', 'Ng'),
    ('gh', '', '', 'g'),
    ('ngi', '', '[aouy]', 'Nj'),
    ('ngi', '', '', '(Ngi|Ni)'),
    ('nge', '', '[aouy]', 'Nj'),
    ('nge', '', '', '(Nje|Nge)'),
    ('gi', '', '[aouy]', 'j'),
    ('gi', '', '', '(gi|i)'),  # what about Pantazis = Pantagis ???
    ('ge', '', '[aouy]', 'j'),
    ('ge', '', '', '(je|ge)'),
    # fragakis = fraggakis = frangakis; angel = agel = aggel
    ('ng', '', '', 'Ng'),
    ('i', '', '[aeou]', 'j'),
    ('i', '[aeou]', '', 'j'),
    ('y', '', '[aeou]', 'j'),
    ('y', '[aeou]', '', 'j'),
    ('yi', '', '[aeou]', 'j'),
    ('yi', '', '', 'i'),
    ('ch', '', '', 'x'),
    ('kh', '', '', 'x'),
    ('dh', '', '', 'd'),  # actually as "th" in English "that"
    ('dj', '', '', 'dZ'),  # Turkish words
    ('ph', '', '', 'f'),
    ('th', '', '', 't'),
    ('kz', '', '', 'gz'),
    ('tz', '', '', 'dz'),
    ('s', '', '[bgdmnr]', 'z'),
    ('mb', '', '', '(mb|b)'),  # Liberis = Limperis = Limberis
    ('mp', '^', '', 'b'),
    ('mp', '[aeiouy]', '', 'mp'),
    ('mp', '', '', 'b'),
    ('nt', '^', '', 'd'),
    ('nt', '[aeiouy]', '', '(nd|nt)'),  # Greek "nd"
    ('nt', '', '', '(nt|d)'),  # Greek "d" after any consonant
    ('á', '', '', 'a'),
    ('é', '', '', 'e'),
    ('í', '', '', 'i'),
    # NOTE(review): 'óu' follows the shorter 'ó' entry with the same empty
    # contexts; if the first matching entry wins it is never reached.
    ('ó', '', '', 'o'),
    ('óu', '', '', 'u'),
    ('ú', '', '', 'u'),
    ('ý', '', '', '(i|Q|u)'),  # [ü]
    ('a', '', '', 'a'),
    ('b', '', '', '(b|v)'),  # beta: modern "v", old "b"
    ('c', '', '', 'k'),
    ('d', '', '', 'd'),  # modern like "th" in English "them", old "d"
    ('e', '', '', 'e'),
    ('f', '', '', 'f'),
    ('g', '', '', 'g'),
    ('h', '', '', 'x'),
    ('i', '', '', 'i'),
    # Panajotti = Panaiotti; Louijos = Louizos; Pantajis = Pantazis = Pantagis
    ('j', '', '', '(j|Z)'),
    ('k', '', '', 'k'),
    ('l', '', '', 'l'),
    ('m', '', '', 'm'),
    ('n', '', '', 'n'),
    # NOTE(review): verify whether the pattern below is Latin 'o' or Greek
    # omicron (U+03BF); it is copied verbatim from the generated table.
    ('ο', '', '', 'o'),
    ('p', '', '', 'p'),
    ('q', '', '', 'k'),  # foreign
    ('r', '', '', 'r'),
    ('s', '', '', 's'),
    ('t', '', '', 't'),
    ('u', '', '', 'u'),
    ('v', '', '', 'v'),
    ('w', '', '', 'v'),  # foreign
    ('x', '', '', 'ks'),
    ('y', '', '', '(i|Q|u)'),  # [ü]
    ('z', '', '', 'z'),
)

# Generated from gen/ruleshebrew.php (BMPM PHP reference implementation)

# General = Ashkenazic
# Hebrew-script rules of the "gen" rule set.  Every entry is a 4-tuple of
# strings; from the values the fields look like (pattern, left context,
# right context, phonetic output) -- confirm against the code that reads the
# table.  Entry order is kept exactly as generated: two-character patterns
# come before the single letters they start with, so reordering is
# presumably unsafe.
_GEN_RULES_HEBREW = (
    ('אי', '', '', 'i'),
    ('עי', '', '', 'i'),
    ('עו', '', '', 'VV'),
    ('או', '', '', 'VV'),
    ('ג׳', '', '', 'Z'),
    ('ד׳', '', '', 'dZ'),
    ('א', '', '', 'L'),
    ('ב', '', '', 'b'),
    ('ג', '', '', 'g'),
    ('ד', '', '', 'd'),
    ('ה', '^', '', '1'),
    ('ה', '', '$', '1'),
    ('ה', '', '', ''),
    ('וו', '', '', 'V'),
    ('וי', '', '', 'WW'),
    ('ו', '', '', 'W'),
    ('ז', '', '', 'z'),
    ('ח', '', '', 'X'),
    ('ט', '', '', 'T'),
    ('יי', '', '', 'i'),
    ('י', '', '', 'i'),
    ('ך', '', '', 'X'),
    ('כ', '^', '', 'K'),
    ('כ', '', '', 'k'),
    ('ל', '', '', 'l'),
    ('ם', '', '', 'm'),
    ('מ', '', '', 'm'),
    ('ן', '', '', 'n'),
    ('נ', '', '', 'n'),
    ('ס', '', '', 's'),
    ('ע', '', '', 'L'),
    ('ף', '', '', 'f'),
    ('פ', '', '', 'f'),
    ('ץ', '', '', 'C'),
    ('צ', '', '', 'C'),
    ('ק', '', '', 'K'),
    ('ר', '', '', 'r'),
    ('ש', '', '', 's'),
    ('ת', '', '', 'TB'),  # only Ashkenazic
)

# Generated from gen/ruleshungarian.php (BMPM PHP reference implementation)

# GENERAL
# Hungarian rules of the "gen" rule set.  Every entry is a 4-tuple of
# strings; from the values the fields look like (pattern, left context,
# right context, phonetic output) -- confirm against the code that reads the
# table.  Entry order is kept exactly as generated: multi-letter patterns
# come before the single letters they start with, so reordering is
# presumably unsafe.
_GEN_RULES_HUNGARIAN = (
    # CONSONANTS
    ('sz', '', '', 's'),
    ('zs', '', '', 'Z'),
    ('cs', '', '', 'tS'),
    ('ay', '', '', '(oj|aj)'),
    ('ai', '', '', '(oj|aj)'),
    ('aj', '', '', '(oj|aj)'),
    ('ei', '', '', '(aj|ej)'),  # German element
    ('ey', '', '', '(aj|ej)'),  # German element
    ('y', '[áo]', '', 'j'),
    ('i', '[áo]', '', 'j'),
    ('ee', '', '', '(ej|e)'),
    ('ely', '', '', '(ej|eli)'),
    ('ly', '', '', '(j|li)'),
    ('gy', '', '[aeouáéóúüöőű]', 'dj'),
    ('gy', '', '', '(d|gi)'),
    ('ny', '', '[aeouáéóúüöőű]', 'nj'),
    ('ny', '', '', '(n|ni)'),
    ('ty', '', '[aeouáéóúüöőű]', 'tj'),
    ('ty', '', '', '(t|ti)'),
    ('qu', '', '', '(ku|kv)'),
    ('h', '', '$', ''),
    # SPECIAL VOWELS
    ('á', '', '', 'a'),
    ('é', '', '', 'e'),
    ('í', '', '', 'i'),
    ('ó', '', '', 'o'),
    ('ú', '', '', 'u'),
    ('ö', '', '', 'Y'),
    ('ő', '', '', 'Y'),
    ('ü', '', '', 'Q'),
    ('ű', '', '', 'Q'),
    # LATIN ALPHABET
    ('a', '', '', 'a'),
    ('b', '', '', 'b'),
    ('c', '', '', 'ts'),
    ('d', '', '', 'd'),
    ('e', '', '', 'E'),
    ('f', '', '', 'f'),
    ('g', '', '', 'g'),
    ('h', '', '', 'h'),
    ('i', '', '', 'I'),
    ('j', '', '', 'j'),
    ('k', '', '', 'k'),
    ('l', '', '', 'l'),
    ('m', '', '', 'm'),
    ('n', '', '', 'n'),
    ('o', '', '', 'o'),
    ('p', '', '', 'p'),
    ('q', '', '', 'k'),
    ('r', '', '', 'r'),
    ('s', '', '', '(S|s)'),
    ('t', '', '', 't'),
    ('u', '', '', 'u'),
    ('v', '', '', 'v'),
    ('w', '', '', 'v'),
    ('x', '', '', 'ks'),
    ('y', '', '', 'i'),
    ('z', '', '', 'z'),
)

# Generated from gen/rulesitalian.php (BMPM PHP reference implementation)

# Italian rules of the "gen" rule set.  Every entry is a 4-tuple of strings;
# from the values the fields look like (pattern, left context, right
# context, phonetic output) -- confirm against the code that reads the table.
# Entry order is kept exactly as generated; since context-specific entries
# precede the plain single letters, reordering is presumably unsafe.
_GEN_RULES_ITALIAN = (
    ('kh', '', '', 'x'),  # foreign
    ('gli', '', '', '(l|gli)'),
    ('gn', '', '[aeou]', '(n|nj|gn)'),
    ('gni', '', '', '(ni|gni)'),
    ('gi', '', '[aeou]', 'dZ'),
    ('gg', '', '[ei]', 'dZ'),
    ('g', '', '[ei]', 'dZ'),
    ('h', '[bdgt]', '', 'g'),  # gh is It; others from Arabic translit
    ('h', '', '$', ''),  # foreign
    ('ci', '', '[aeou]', 'tS'),
    ('ch', '', '[ei]', 'k'),
    ('sc', '', '[ei]', 'S'),
    ('cc', '', '[ei]', 'tS'),
    ('c', '', '[ei]', 'tS'),
    ('s', '[aeiou]', '[aeiou]', 'z'),
    ('i', '[aeou]', '', 'j'),
    ('i', '', '[aeou]', 'j'),
    ('y', '[aeou]', '', 'j'),  # foreign
    ('y', '', '[aeou]', 'j'),  # foreign
    ('qu', '', '', 'k'),
    ('uo', '', '', '(vo|o)'),
    ('u', '', '[aei]', 'v'),
    ('è', '', '', 'e'),
    ('é', '', '', 'e'),
    ('ò', '', '', 'o'),
    ('ó', '', '', 'o'),
    # LATIN ALPHABET
    ('a', '', '', 'a'),
    ('b', '', '', 'b'),
    ('c', '', '', 'k'),
    ('d', '', '', 'd'),
    ('e', '', '', 'e'),
    ('f', '', '', 'f'),
    ('g', '', '', 'g'),
    ('h', '', '', 'h'),
    ('i', '', '', 'i'),
    ('j', '', '', '(Z|dZ|j)'),  # foreign
    ('k', '', '', 'k'),
    ('l', '', '', 'l'),
    ('m', '', '', 'm'),
    ('n', '', '', 'n'),
    ('o', '', '', 'o'),
    ('p', '', '', 'p'),
    ('q', '', '', 'k'),
    ('r', '', '', 'r'),
    ('s', '', '', 's'),
    ('t', '', '', 't'),
    ('u', '', '', 'u'),
    ('v', '', '', 'v'),
    ('w', '', '', 'v'),  # foreign
    ('x', '', '', 'ks'),  # foreign
    ('y', '', '', 'i'),  # foreign
    ('z', '', '', '(ts|dz)'),
)

# Generated from gen/ruleslatvian.php (BMPM PHP reference implementation)

# GENERAL
# Latvian rules of the "gen" rule set.  Every entry is a 4-tuple of strings;
# from the values the fields look like (pattern, left context, right
# context, phonetic output) -- confirm against the code that reads the table.
# Entry order is kept exactly as generated.  The commented-out tuples are
# alternatives carried over from the generated file, each left next to the
# active entry for the same pattern.
_GEN_RULES_LATVIAN = (
    # CONSONANTS
    ('č', '', '', 'tS'),
    ('ģ', '', '', '(d|dj)'),
    # ("ķ","","","(t|ti)"),
    ('ķ', '', '', '(t|tj)'),
    # ("ļ","","","lj"),
    ('ļ', '', '', 'l'),
    ('š', '', '', 'S'),
    ('ņ', '', '', '(n|nj)'),
    ('ž', '', '', 'Z'),
    # SPECIAL VOWELS
    ('ā', '', '', 'a'),
    ('ē', '', '', 'e'),
    ('ī', '', '', 'i'),
    ('ū', '', '', 'u'),
    # DIPHTONGS
    # ("ai","","","(D|ai)"),
    ('ai', '', '', 'aj'),
    # ("ei","","","(D|ei)"),
    ('ei', '', '', 'ej'),
    ('io', '', '', 'jo'),
    # ("iu","","","(D|iu)"),
    ('iu', '', '', 'ju'),
    # ("ie","","","(D|ie)"),
    ('ie', '', '', 'je'),
    # ("o","","","(D|uo)"),
    ('o', '', '', 'o'),
    # ("ui","","","(D|ui)"),
    ('ui', '', '', 'uj'),
    # LATIN ALPHABET
    ('a', '', '', 'a'),
    ('b', '', '', 'b'),
    ('c', '', '', 'ts'),
    ('d', '', '', 'd'),
    ('e', '', '', 'E'),
    ('f', '', '', 'f'),
    ('g', '', '', 'g'),
    ('h', '', '', 'h'),
    ('i', '', '', 'I'),
    ('j', '', '', 'j'),
    ('k', '', '', 'k'),
    ('l', '', '', 'l'),
    ('m', '', '', 'm'),
    ('n', '', '', 'n'),
    ('p', '', '', 'p'),
    ('r', '', '', 'r'),
    ('s', '', '', 's'),
    ('t', '', '', 't'),
    ('u', '', '', 'u'),
    ('v', '', '', 'v'),
    ('z', '', '', 'z'),
)

# Generated from gen/rulespolish.php (BMPM PHP reference implementation)

# GENERIC
# Polish rules of the "gen" rule set.  Every entry is a 4-tuple of strings;
# from the values the fields look like (pattern, left context, right
# context, phonetic output) -- confirm against the code that reads the table.
# Entry order is kept exactly as generated: multi-letter patterns come before
# the single letters they start with, so reordering is presumably unsafe.
_GEN_RULES_POLISH = (
    # CONVERTING FEMININE TO MASCULINE
    ('ska', '', '$', 'ski'),
    ('cka', '', '$', 'tski'),
    ('lowa', '', '$', '(lova|lof|l|el)'),
    ('kowa', '', '$', '(kova|kof|k|ek)'),
    ('owa', '', '$', '(ova|of|)'),
    ('lowna', '', '$', '(lovna|levna|l|el)'),
    ('kowna', '', '$', '(kovna|k|ek)'),
    ('owna', '', '$', '(ovna|)'),
    ('lówna', '', '$', '(l|el)'),
    ('kówna', '', '$', '(k|ek)'),
    ('ówna', '', '$', ''),
    ('a', '', '$', '(a|i)'),
    # CONSONANTS
    ('czy', '', '', 'tSi'),
    ('cze', '', '[bcdgkpstwzż]', '(tSe|tSF)'),
    ('ciewicz', '', '', '(tsevitS|tSevitS)'),
    ('siewicz', '', '', '(sevitS|SevitS)'),
    ('ziewicz', '', '', '(zevitS|ZevitS)'),
    ('riewicz', '', '', 'rjevitS'),
    ('diewicz', '', '', 'djevitS'),
    ('tiewicz', '', '', 'tjevitS'),
    ('iewicz', '', '', 'evitS'),
    ('ewicz', '', '', 'evitS'),
    ('owicz', '', '', 'ovitS'),
    ('icz', '', '', 'itS'),
    ('cz', '', '', 'tS'),
    ('ch', '', '', 'x'),
    ('cia', '', '[bcdgkpstwzż]', '(tSB|tsB)'),
    ('cia', '', '', '(tSa|tsa)'),
    ('cią', '', '[bp]', '(tSom|tsom)'),
    ('cią', '', '', '(tSon|tson)'),
    ('cię', '', '[bp]', '(tSem|tsem)'),
    ('cię', '', '', '(tSen|tsen)'),
    ('cie', '', '[bcdgkpstwzż]', '(tSF|tsF)'),
    ('cie', '', '', '(tSe|tse)'),
    ('cio', '', '', '(tSo|tso)'),
    ('ciu', '', '', '(tSu|tsu)'),
    ('ci', '', '', '(tSi|tsI)'),
    ('ć', '', '', '(tS|ts)'),
    ('ssz', '', '', 'S'),
    ('sz', '', '', 'S'),
    ('sia', '', '[bcdgkpstwzż]', '(SB|sB|sja)'),
    ('sia', '', '', '(Sa|sja)'),
    ('sią', '', '[bp]', '(Som|som)'),
    ('sią', '', '', '(Son|son)'),
    ('się', '', '[bp]', '(Sem|sem)'),
    ('się', '', '', '(Sen|sen)'),
    ('sie', '', '[bcdgkpstwzż]', '(SF|sF|se)'),
    ('sie', '', '', '(Se|se)'),
    ('sio', '', '', '(So|so)'),
    ('siu', '', '', '(Su|sju)'),
    ('si', '', '', '(Si|sI)'),
    ('ś', '', '', '(S|s)'),
    ('zia', '', '[bcdgkpstwzż]', '(ZB|zB|zja)'),
    ('zia', '', '', '(Za|zja)'),
    ('zią', '', '[bp]', '(Zom|zom)'),
    ('zią', '', '', '(Zon|zon)'),
    ('zię', '', '[bp]', '(Zem|zem)'),
    ('zię', '', '', '(Zen|zen)'),
    ('zie', '', '[bcdgkpstwzż]', '(ZF|zF)'),
    ('zie', '', '', '(Ze|ze)'),
    ('zio', '', '', '(Zo|zo)'),
    ('ziu', '', '', '(Zu|zju)'),
    ('zi', '', '', '(Zi|zI)'),
    # NOTE(review): the next two entries share pattern and both contexts; if
    # the first matching entry wins, the second is never selected.  Both are
    # kept verbatim from the generated table.
    ('że', '', '[bcdgkpstwzż]', '(Ze|ZF)'),
    ('że', '', '[bcdgkpstwzż]', '(Ze|ZF|ze|zF)'),
    ('że', '', '', 'Ze'),
    ('źe', '', '', '(Ze|ze)'),
    ('ży', '', '', 'Zi'),
    ('źi', '', '', '(Zi|zi)'),
    ('ż', '', '', 'Z'),
    ('ź', '', '', '(Z|z)'),
    ('rze', 't', '', '(Se|re)'),
    ('rze', '', '', '(Ze|re|rZe)'),
    ('rzy', 't', '', '(Si|ri)'),
    ('rzy', '', '', '(Zi|ri|rZi)'),
    ('rz', 't', '', '(S|r)'),
    ('rz', '', '', '(Z|r|rZ)'),
    ('lio', '', '', '(lo|le)'),
    ('ł', '', '', 'l'),
    ('ń', '', '', 'n'),
    ('qu', '', '', 'k'),
    ('s', '', 's', ''),
    # VOWELS
    ('ó', '', '', '(u|o)'),
    ('ą', '', '[bp]', 'om'),
    ('ę', '', '[bp]', 'em'),
    ('ą', '', '', 'on'),
    ('ę', '', '', 'en'),
    ('ije', '', '', 'je'),
    ('yje', '', '', 'je'),
    ('iie', '', '', 'je'),
    ('yie', '', '', 'je'),
    ('iye', '', '', 'je'),
    ('yye', '', '', 'je'),
    ('ij', '', '[aou]', 'j'),
    ('yj', '', '[aou]', 'j'),
    ('ii', '', '[aou]', 'j'),
    ('yi', '', '[aou]', 'j'),
    ('iy', '', '[aou]', 'j'),
    ('yy', '', '[aou]', 'j'),
    ('rie', '', '', 'rje'),
    ('die', '', '', 'dje'),
    ('tie', '', '', 'tje'),
    ('ie', '', '[bcdgkpstwzż]', 'F'),
    ('ie', '', '', 'e'),
    ('aue', '', '', 'aue'),
    ('au', '', '', 'au'),
    ('ei', '', '', 'aj'),
    ('ey', '', '', 'aj'),
    ('ej', '', '', 'aj'),
    ('ai', '', '', 'aj'),
    ('ay', '', '', 'aj'),
    ('aj', '', '', 'aj'),
    ('i', '[aeou]', '', 'j'),
    ('y', '[aeou]', '', 'j'),
    ('i', '', '[aou]', 'j'),
    ('y', '', '[aeou]', 'j'),
    ('a', '', '[bcdgkpstwzż]', 'B'),
    ('e', '', '[bcdgkpstwzż]', '(E|F)'),
    ('o', '', '[bcćdgklłmnńrsśtwzźż]', 'P'),
    # LATIN ALPHABET
    ('a', '', '', 'a'),
    ('b', '', '', 'b'),
    ('c', '', '', 'ts'),
    ('d', '', '', 'd'),
    ('e', '', '', 'E'),
    ('f', '', '', 'f'),
    ('g', '', '', 'g'),
    ('h', '', '', '(h|x)'),
    ('i', '', '', 'I'),
    ('j', '', '', 'j'),
    ('k', '', '', 'k'),
    ('l', '', '', 'l'),
    ('m', '', '', 'm'),
    ('n', '', '', 'n'),
    ('o', '', '', 'o'),
    ('p', '', '', 'p'),
    ('q', '', '', 'k'),
    ('r', '', '', 'r'),
    ('s', '', '', 's'),
    ('t', '', '', 't'),
    ('u', '', '', 'u'),
    ('v', '', '', 'v'),
    ('w', '', '', 'v'),
    ('x', '', '', 'ks'),
    ('y', '', '', 'I'),
    ('z', '', '', 'z'),
)
2681
2682
# gen/rulesportuguese.php
2683
2684 1
_GEN_RULES_PORTUGUESE = (
2685
    ('kh', '', '', 'x'),  # foreign
2686
    ('ch', '', '', 'S'),
2687
    ('ss', '', '', 's'),
2688
    ('sc', '', '[ei]', 's'),
2689
    ('sç', '', '[aou]', 's'),
2690
    ('ç', '', '', 's'),
2691
    ('c', '', '[ei]', 's'),
2692
    # ("c","","[aou]","(k|C)"),
2693
    ('s', '^', '', 's'),
2694
    ('s', '[aáuiíoóeéêy]', '[aáuiíoóeéêy]', 'z'),
2695
    ('s', '', '[dglmnrv]', '(Z|S)'),  # Z is Brazil
2696
    ('z', '', '$', '(Z|s|S)'),  # s and S in Brazil
2697
    ('z', '', '[bdgv]', '(Z|z)'),  # Z in Brazil
2698
    ('z', '', '[ptckf]', '(s|S|z)'),  # s and S in Brazil
2699
    ('gu', '', '[eiu]', 'g'),
2700
    ('gu', '', '[ao]', 'gv'),
2701
    ('g', '', '[ei]', 'Z'),
2702
    ('qu', '', '[eiu]', 'k'),
2703
    ('qu', '', '[ao]', 'kv'),
2704
    ('uo', '', '', '(vo|o|u)'),
2705
    ('u', '', '[aei]', 'v'),
2706
    ('lh', '', '', 'l'),
2707
    ('nh', '', '', 'nj'),
2708
    ('h', '[bdgt]', '', ''),  # translit. from Arabic
2709
    ('h', '', '$', ''),  # foreign
2710
    ('ex', '', '[aáuiíoóeéêy]', '(ez|eS|eks)'),  # ez in Brazil
2711
    ('ex', '', '[cs]', 'e'),
2712
    ('y', '[aáuiíoóeéê]', '', 'j'),
2713
    ('y', '', '[aeiíou]', 'j'),
2714
    (
2715
        'm',
2716
        '',
2717
        '[bcdfglnprstv]',
2718
        '(m|n)',
2719
    ),  # maybe to add a rule for m/n before a consonant that disappears [preceeding vowel becomes nasalized]  # noqa: E501
0 ignored issues
show
Coding Style introduced by
This line is too long as per the coding-style (123/100).

This check looks for lines that are too long. You can specify the maximum line length.

Loading history...
2720
    (
2721
        'm',
2722
        '',
2723
        '$',
2724
        '(m|n)',
2725
    ),  # maybe to add a rule for final m/n that disappears [preceeding vowel becomes nasalized]  # noqa: E501
0 ignored issues
show
Coding Style introduced by
This line is too long as per the coding-style (110/100).

This check looks for lines that are too long. You can specify the maximum line length.

Loading history...
2726
    ('ão', '', '', '(au|an|on)'),
2727
    ('ãe', '', '', '(aj|an)'),
2728
    ('ãi', '', '', '(aj|an)'),
2729
    ('õe', '', '', '(oj|on)'),
2730
    ('i', '[aáuoóeéê]', '', 'j'),
2731
    ('i', '', '[aeou]', 'j'),
2732
    ('â', '', '', 'a'),
2733
    ('à', '', '', 'a'),
2734
    ('á', '', '', 'a'),
2735
    ('ã', '', '', '(a|an|on)'),
2736
    ('é', '', '', 'e'),
2737
    ('ê', '', '', 'e'),
2738
    ('í', '', '', 'i'),
2739
    ('ô', '', '', 'o'),
2740
    ('ó', '', '', 'o'),
2741
    ('õ', '', '', '(o|on)'),
2742
    ('ú', '', '', 'u'),
2743
    ('ü', '', '', 'u'),
2744
    ('aue', '', '', 'aue'),
2745
    # LATIN ALPHABET
2746
    ('a', '', '', 'a'),
2747
    ('b', '', '', 'b'),
2748
    ('c', '', '', 'k'),
2749
    ('d', '', '', 'd'),
2750
    ('e', '', '', '(e|i)'),
2751
    ('f', '', '', 'f'),
2752
    ('g', '', '', 'g'),
2753
    ('h', '', '', 'h'),
2754
    ('i', '', '', 'i'),
2755
    ('j', '', '', 'Z'),
2756
    ('k', '', '', 'k'),
2757
    ('l', '', '', 'l'),
2758
    ('m', '', '', 'm'),
2759
    ('n', '', '', 'n'),
2760
    ('o', '', '', '(o|u)'),
2761
    ('p', '', '', 'p'),
2762
    ('q', '', '', 'k'),
2763
    ('r', '', '', 'r'),
2764
    ('s', '', '', 'S'),
2765
    ('t', '', '', 't'),
2766
    ('u', '', '', 'u'),
2767
    ('v', '', '', 'v'),
2768
    ('w', '', '', 'v'),
2769
    ('x', '', '', '(S|ks)'),
2770
    ('y', '', '', 'i'),
2771
    ('z', '', '', 'z'),
2772
)
2773
2774
# gen/rulesromanian.php
2775
2776
# GENERAL
2777 1
_GEN_RULES_ROMANIAN = (
2778
    ('ce', '', '', 'tSe'),
2779
    ('ci', '', '', '(tSi|tS)'),
2780
    ('ch', '', '[ei]', 'k'),
2781
    ('ch', '', '', 'x'),  # foreign
2782
    ('gi', '', '', '(dZi|dZ)'),
2783
    ('g', '', '[ei]', 'dZ'),
2784
    ('gh', '', '', 'g'),
2785
    ('i', '[aeou]', '', 'j'),
2786
    ('i', '', '[aeou]', 'j'),
2787
    ('ţ', '', '', 'ts'),
2788
    ('ş', '', '', 'S'),
2789
    ('qu', '', '', 'k'),
2790
    ('î', '', '', 'i'),
2791
    ('ea', '', '', 'ja'),
2792
    ('ă', '', '', '(e|a)'),
2793
    ('aue', '', '', 'aue'),
2794
    # LATIN ALPHABET
2795
    ('a', '', '', 'a'),
2796
    ('b', '', '', 'b'),
2797
    ('c', '', '', 'k'),
2798
    ('d', '', '', 'd'),
2799
    ('e', '', '', 'E'),
2800
    ('f', '', '', 'f'),
2801
    ('g', '', '', 'g'),
2802
    ('h', '', '', '(x|h)'),
2803
    ('i', '', '', 'I'),
2804
    ('j', '', '', 'Z'),
2805
    ('k', '', '', 'k'),
2806
    ('l', '', '', 'l'),
2807
    ('m', '', '', 'm'),
2808
    ('n', '', '', 'n'),
2809
    ('o', '', '', 'o'),
2810
    ('p', '', '', 'p'),
2811
    ('q', '', '', 'k'),
2812
    ('r', '', '', 'r'),
2813
    ('s', '', '', 's'),
2814
    ('t', '', '', 't'),
2815
    ('u', '', '', 'u'),
2816
    ('v', '', '', 'v'),
2817
    ('w', '', '', 'v'),
2818
    ('x', '', '', 'ks'),
2819
    ('y', '', '', 'i'),
2820
    ('z', '', '', 'z'),
2821
)
2822
2823
# gen/rulesrussian.php
2824
2825
# GENERAL
2826 1
_GEN_RULES_RUSSIAN = (
2827
    # CONVERTING FEMININE TO MASCULINE
2828
    ('yna', '', '$', '(in|ina)'),
2829
    ('ina', '', '$', '(in|ina)'),
2830
    ('liova', '', '$', '(lof|lef)'),
2831
    ('lova', '', '$', '(lof|lef|lova)'),
2832
    ('ova', '', '$', '(of|ova)'),
2833
    ('eva', '', '$', '(ef|ova)'),
2834
    ('aia', '', '$', '(aja|i)'),
2835
    ('aja', '', '$', '(aja|i)'),
2836
    ('aya', '', '$', '(aja|i)'),
2837
    # SPECIAL CONSONANTS
2838
    ('tsya', '', '', 'tsa'),
2839
    ('tsyu', '', '', 'tsu'),
2840
    ('tsia', '', '', 'tsa'),
2841
    ('tsie', '', '', 'tse'),
2842
    ('tsio', '', '', 'tso'),
2843
    ('tsye', '', '', 'tse'),
2844
    ('tsyo', '', '', 'tso'),
2845
    ('tsiu', '', '', 'tsu'),
2846
    ('sie', '', '', 'se'),
2847
    ('sio', '', '', 'so'),
2848
    ('zie', '', '', 'ze'),
2849
    ('zio', '', '', 'zo'),
2850
    ('sye', '', '', 'se'),
2851
    ('syo', '', '', 'so'),
2852
    ('zye', '', '', 'ze'),
2853
    ('zyo', '', '', 'zo'),
2854
    ('ger', '', '$', 'ger'),
2855
    ('gen', '', '$', 'gen'),
2856
    ('gin', '', '$', 'gin'),
2857
    ('gg', '', '', 'g'),
2858
    ('g', '[jaeoiuy]', '[aeoiu]', 'g'),
2859
    ('g', '', '[aeoiu]', '(g|h)'),
2860
    ('kh', '', '', 'x'),
2861
    ('ch', '', '', '(tS|x)'),
2862
    ('sch', '', '', '(StS|S)'),
2863
    ('ssh', '', '', 'S'),
2864
    ('sh', '', '', 'S'),
2865
    ('zh', '', '', 'Z'),
2866
    ('tz', '', '$', 'ts'),
2867
    ('tz', '', '', '(ts|tz)'),
2868
    ('c', '', '[iey]', 's'),
2869
    ('qu', '', '', '(kv|k)'),
2870
    ('s', '', 's', ''),
2871
    # SPECIAL VOWELS
2872
    ('lya', '', '', 'la'),
2873
    ('lyu', '', '', 'lu'),
2874
    ('lia', '', '', 'la'),  # not in DJSRE
2875
    ('liu', '', '', 'lu'),  # not in DJSRE
2876
    ('lja', '', '', 'la'),  # not in DJSRE
2877
    ('lju', '', '', 'lu'),  # not in DJSRE
2878
    ('le', '', '', '(lo|lE)'),  # not in DJSRE
2879
    ('lyo', '', '', '(lo|le)'),  # not in DJSRE
2880
    ('lio', '', '', '(lo|le)'),
2881
    ('ije', '', '', 'je'),
2882
    ('ie', '', '', 'je'),
2883
    ('iye', '', '', 'je'),
2884
    ('iie', '', '', 'je'),
2885
    ('yje', '', '', 'je'),
2886
    ('ye', '', '', 'je'),
2887
    ('yye', '', '', 'je'),
2888
    ('yie', '', '', 'je'),
2889
    ('ij', '', '[aou]', 'j'),
2890
    ('iy', '', '[aou]', 'j'),
2891
    ('ii', '', '[aou]', 'j'),
2892
    ('yj', '', '[aou]', 'j'),
2893
    ('yy', '', '[aou]', 'j'),
2894
    ('yi', '', '[aou]', 'j'),
2895
    ('io', '', '', '(jo|e)'),
2896
    ('i', '', '[au]', 'j'),
2897
    ('i', '[aeou]', '', 'j'),
2898
    ('yo', '', '', '(jo|e)'),
2899
    ('y', '', '[au]', 'j'),
2900
    ('y', '[aeiou]', '', 'j'),
2901
    ('ii', '', '$', 'i'),
2902
    ('iy', '', '$', 'i'),
2903
    ('yy', '', '$', 'i'),
2904
    ('yi', '', '$', 'i'),
2905
    ('yj', '', '$', 'i'),
2906
    ('ij', '', '$', 'i'),
2907
    ('e', '^', '', '(je|E)'),
2908
    ('ee', '', '', '(aje|i)'),
2909
    ('e', '[aou]', '', 'je'),
2910
    ('oo', '', '', '(oo|u)'),
2911
    ('\'', '', '', ''),
2912
    ('"', '', '', ''),
2913
    ('aue', '', '', 'aue'),
2914
    # LATIN ALPHABET
2915
    ('a', '', '', 'a'),
2916
    ('b', '', '', 'b'),
2917
    ('c', '', '', 'k'),
2918
    ('d', '', '', 'd'),
2919
    ('e', '', '', 'E'),
2920
    ('f', '', '', 'f'),
2921
    ('g', '', '', 'g'),
2922
    ('h', '', '', 'h'),
2923
    ('i', '', '', 'I'),
2924
    ('j', '', '', 'j'),
2925
    ('k', '', '', 'k'),
2926
    ('l', '', '', 'l'),
2927
    ('m', '', '', 'm'),
2928
    ('n', '', '', 'n'),
2929
    ('o', '', '', 'o'),
2930
    ('p', '', '', 'p'),
2931
    ('q', '', '', 'k'),
2932
    ('r', '', '', 'r'),
2933
    ('s', '', '', 's'),
2934
    ('t', '', '', 't'),
2935
    ('u', '', '', 'u'),
2936
    ('v', '', '', 'v'),
2937
    ('w', '', '', 'v'),
2938
    ('x', '', '', 'ks'),
2939
    ('y', '', '', 'I'),
2940
    ('z', '', '', 'z'),
2941
)
2942
2943
# gen/rulesspanish.php
2944
2945
# GENERAL
2946 1
_GEN_RULES_SPANISH = (
2947
    # Includes both Spanish (Castillian) & Catalan
2948
    # CONSONANTS
2949
    ('ñ', '', '', '(n|nj)'),
2950
    ('ny', '', '', 'nj'),  # Catalan
2951
    ('ç', '', '', 's'),  # Catalan
2952
    ('ig', '[aeiou]', '', '(tS|ig)'),  # tS is Catalan
2953
    ('ix', '[aeiou]', '', 'S'),  # Catalan
2954
    ('tx', '', '', 'tS'),  # Catalan
2955
    ('tj', '', '$', 'tS'),  # Catalan
2956
    ('tj', '', '', 'dZ'),  # Catalan
2957
    ('tg', '', '', '(tg|dZ)'),  # dZ is Catalan
2958
    ('ch', '', '', '(tS|dZ)'),  # dZ is typical for Argentina
2959
    ('bh', '', '', 'b'),  # translit. from Arabic
2960
    ('h', '[dgt]', '', ''),  # translit. from Arabic
2961
    ('h', '', '$', ''),  # foreign
2962
    # ("ll","","","(l|Z)"), # Z is typical for Argentina, only Ashkenazic
2963
    ('m', '', '[bpvf]', '(m|n)'),
2964
    ('c', '', '[ei]', 's'),
2965
    # ("c","","[aou]","(k|C)"),
2966
    ('gu', '', '[ei]', '(g|gv)'),  # "gv" because "u" can actually be "ü"
2967
    ('g', '', '[ei]', '(x|g|dZ)'),  # "g" only for foreign words; dZ is Catalan
2968
    ('qu', '', '', 'k'),
2969
    ('uo', '', '', '(vo|o)'),
2970
    ('u', '', '[aei]', 'v'),
2971
    # SPECIAL VOWELS
2972
    ('ü', '', '', 'v'),
2973
    ('á', '', '', 'a'),
2974
    ('é', '', '', 'e'),
2975
    ('í', '', '', 'i'),
2976
    ('ó', '', '', 'o'),
2977
    ('ú', '', '', 'u'),
2978
    ('à', '', '', 'a'),  # Catalan
2979
    ('è', '', '', 'e'),  # Catalan
2980
    ('ò', '', '', 'o'),  # Catalan
2981
    # LATIN ALPHABET
2982
    ('a', '', '', 'a'),
2983
    ('b', '', '', 'B'),
2984
    ('c', '', '', 'k'),
2985
    ('d', '', '', 'd'),
2986
    ('e', '', '', 'e'),
2987
    ('f', '', '', 'f'),
2988
    ('g', '', '', 'g'),
2989
    ('h', '', '', 'h'),
2990
    ('i', '', '', 'i'),
2991
    ('j', '', '', '(x|Z)'),  # Z is Catalan
2992
    ('k', '', '', 'k'),
2993
    ('l', '', '', 'l'),
2994
    ('m', '', '', 'm'),
2995
    ('n', '', '', 'n'),
2996
    ('o', '', '', 'o'),
2997
    ('p', '', '', 'p'),
2998
    ('q', '', '', 'k'),
2999
    ('r', '', '', 'r'),
3000
    ('s', '', '', 's'),
3001
    ('t', '', '', 't'),
3002
    ('u', '', '', 'u'),
3003
    ('v', '', '', 'V'),
3004
    ('w', '', '', 'v'),  # foreign words
3005
    ('x', '', '', '(ks|gz|S)'),  # ks is Spanish, all are Catalan
3006
    ('y', '', '', '(i|j)'),
3007
    (
3008
        'z',
3009
        '',
3010
        '',
3011
        '(z|s)',
3012
    ),  # as "c" befoire "e" or "i", in Spain it is like unvoiced English "th"  # noqa: E501
3013
)
3014
3015
# gen/rulesturkish.php
3016
3017 1
_GEN_RULES_TURKISH = (
3018
    ('ç', '', '', 'tS'),
3019
    ('ğ', '', '', ''),  # to show that previous vowel is long
3020
    ('ş', '', '', 'S'),
3021
    ('ü', '', '', 'Q'),
3022
    ('ö', '', '', 'Y'),
3023
    ('ı', '', '', '(e|i|)'),  # as "e" in English "label"
3024
    ('a', '', '', 'a'),
3025
    ('b', '', '', 'b'),
3026
    ('c', '', '', 'dZ'),
3027
    ('d', '', '', 'd'),
3028
    ('e', '', '', 'e'),
3029
    ('f', '', '', 'f'),
3030
    ('g', '', '', 'g'),
3031
    ('h', '', '', 'h'),
3032
    ('i', '', '', 'i'),
3033
    ('j', '', '', 'Z'),
3034
    ('k', '', '', 'k'),
3035
    ('l', '', '', 'l'),
3036
    ('m', '', '', 'm'),
3037
    ('n', '', '', 'n'),
3038
    ('o', '', '', 'o'),
3039
    ('p', '', '', 'p'),
3040
    ('q', '', '', 'k'),  # foreign words
3041
    ('r', '', '', 'r'),
3042
    ('s', '', '', 's'),
3043
    ('t', '', '', 't'),
3044
    ('u', '', '', 'u'),
3045
    ('v', '', '', 'v'),
3046
    ('w', '', '', 'v'),  # foreign words
3047
    ('x', '', '', 'ks'),  # foreign words
3048
    ('y', '', '', 'j'),
3049
    ('z', '', '', 'z'),
3050
)
3051
3052
# sep/approxany.php
3053
3054
# SEPHARDIC
3055 1
_SEP_APPROX_ANY = (('E', '', '', ''),)  # Final French "e"
3056
3057
# sep/approxcommon.php
3058
# Sephardic
3059
3060 1
_SEP_APPROX_COMMON = (
3061
    ('bens', '^', '', '(binz|s)'),
3062
    ('benS', '^', '', '(binz|s)'),
3063
    ('ben', '^', '', '(bin|)'),
3064
    ('abens', '^', '', '(abinz|binz|s)'),
3065
    ('abenS', '^', '', '(abinz|binz|s)'),
3066
    ('aben', '^', '', '(abin|bin|)'),
3067
    ('els', '^', '', '(ilz|alz|s)'),
3068
    ('elS', '^', '', '(ilz|alz|s)'),
3069
    ('el', '^', '', '(il|al|)'),
3070
    ('als', '^', '', '(alz|s)'),
3071
    ('alS', '^', '', '(alz|s)'),
3072
    ('al', '^', '', '(al|)'),
3073
    # ("dels", "^", "", "(dilz|s)"),
3074
    # ("delS", "^", "", "(dilz|s)"),
3075
    ('del', '^', '', '(dil|)'),
3076
    ('dela', '^', '', '(dila|)'),
3077
    # ("delo", "^", "", "(dila|)"),
3078
    ('da', '^', '', '(da|)'),
3079
    ('de', '^', '', '(di|)'),
3080
    # ("des", "^", "", "(dis|dAs|)"),
3081
    # ("di", "^", "", "(di|)"),
3082
    # ("dos", "^", "", "(das|dus|)"),
3083
    ('oa', '', '', '(va|a|D)'),
3084
    ('oe', '', '', '(vi|D)'),
3085
    ('ae', '', '', 'D'),
3086
    # ("s", "", "$", "(s|)"), # Attia(s)
3087
    # ("C", "", "", "s"),  # "c" could actually be "ç"
3088
    ('n', '', '[bp]', 'm'),
3089
    (
3090
        'h',
3091
        '',
3092
        '',
3093
        '(|h|f)',
3094
    ),  # sound "h" (absent) can be expressed via /x/, Cojab in Spanish = Kohab ; Hakim = Fakim  # noqa: E501
0 ignored issues
show
Coding Style introduced by
This line is too long as per the coding-style (109/100).

This check looks for lines that are too long. You can specify the maximum line length.

Loading history...
3095
    ('x', '', '', 'h'),
3096
    # DIPHTHONGS ARE APPROXIMATELY equivalent
3097
    ('aja', '^', '', '(Da|ia)'),
3098
    ('aje', '^', '', '(Di|Da|i|ia)'),
3099
    ('aji', '^', '', '(Di|i)'),
3100
    ('ajo', '^', '', '(Du|Da|iu|ia)'),
3101
    ('aju', '^', '', '(Du|iu)'),
3102
    ('aj', '', '', 'D'),
3103
    ('ej', '', '', 'D'),
3104
    ('oj', '', '', 'D'),
3105
    ('uj', '', '', 'D'),
3106
    ('au', '', '', 'D'),
3107
    ('eu', '', '', 'D'),
3108
    ('ou', '', '', 'D'),
3109
    ('a', '^', '', '(a|)'),  # Arabic
3110
    ('ja', '^', '', 'ia'),
3111
    ('je', '^', '', 'i'),
3112
    ('jo', '^', '', '(iu|ia)'),
3113
    ('ju', '^', '', 'iu'),
3114
    ('ja', '', '', 'a'),
3115
    ('je', '', '', 'i'),
3116
    ('ji', '', '', 'i'),
3117
    ('jo', '', '', 'u'),
3118
    ('ju', '', '', 'u'),
3119
    ('j', '', '', 'i'),
3120
    # CONSONANTS {z & Z & dZ; s & S} are approximately interchangeable
3121
    ('s', '', '[rmnl]', 'z'),
3122
    ('S', '', '[rmnl]', 'z'),
3123
    ('s', '[rmnl]', '', 'z'),
3124
    ('S', '[rmnl]', '', 'z'),
3125
    ('dS', '', '$', 'S'),
3126
    ('dZ', '', '$', 'S'),
3127
    ('Z', '', '$', 'S'),
3128
    ('S', '', '$', '(S|s)'),
3129
    ('z', '', '$', '(S|s)'),
3130
    ('S', '', '', 's'),
3131
    ('dZ', '', '', 'z'),
3132
    ('Z', '', '', 'z'),
3133
    ('i', '', '$', '(i|)'),  # often in Arabic
3134
    ('e', '', '', 'i'),
3135
    ('o', '', '$', '(a|u)'),
3136
    ('o', '', '', 'u'),
3137
    # special character to deal correctly in Hebrew match
3138
    ('B', '', '', 'b'),
3139
    ('V', '', '', 'v'),
3140
    # Arabic
3141
    ('p', '^', '', 'b'),
3142
)
3143
3144
# sep/approxfrench.php
3145 1
_SEP_APPROX_FRENCH = ()
3146
3147
# sep/approxhebrew.php
3148 1
_SEP_APPROX_HEBREW = ()
3149
3150
# sep/approxitalian.php
3151
3152
# this file uses the same rules as approxfrench.php
3153
3154
# sep/approxportuguese.php
3155
# this file uses the same rules as approxfrench.php
3156
3157
# sep/approxspanish.php
3158
# this file uses the same rules as approxfrench.php
3159
3160
# sep/exactany.php
3161 1
_SEP_EXACT_ANY = (('E', '', '', 'e'),)  # final French "e"
3162
3163
# sep/exactapproxcommon.php
3164
# Sephardic
3165 1
_SEP_EXACT_APPROX_COMMON = (
3166
    ('h', '', '$', ''),
3167
    # VOICED - UNVOICED CONSONANTS
3168
    ('b', '', '[fktSs]', 'p'),
3169
    ('b', '', 'p', ''),
3170
    ('b', '', '$', 'p'),
3171
    ('p', '', '[vgdZz]', 'b'),
3172
    ('p', '', 'b', ''),
3173
    ('v', '', '[pktSs]', 'f'),
3174
    ('v', '', 'f', ''),
3175
    ('v', '', '$', 'f'),
3176
    ('f', '', '[vbgdZz]', 'v'),
3177
    ('f', '', 'v', ''),
3178
    ('g', '', '[pftSs]', 'k'),
3179
    ('g', '', 'k', ''),
3180
    ('g', '', '$', 'k'),
3181
    ('k', '', '[vbdZz]', 'g'),
3182
    ('k', '', 'g', ''),
3183
    ('d', '', '[pfkSs]', 't'),
3184
    ('d', '', 't', ''),
3185
    ('d', '', '$', 't'),
3186
    ('t', '', '[vbgZz]', 'd'),
3187
    ('t', '', 'd', ''),
3188
    ('s', '', 'dZ', ''),
3189
    ('s', '', 'tS', ''),
3190
    ('z', '', '[pfkSt]', 's'),
3191
    ('z', '', '[sSzZ]', ''),
3192
    ('s', '', '[sSzZ]', ''),
3193
    ('Z', '', '[sSzZ]', ''),
3194
    ('S', '', '[sSzZ]', ''),
3195
    # SIMPLIFICATION OF CONSONANT CLUSTERS
3196
    ('nm', '', '', 'm'),
3197
    # DOUBLE --> SINGLE
3198
    ('ji', '^', '', 'i'),
3199
    ('a', '', 'a', ''),
3200
    ('b', '', 'b', ''),
3201
    ('d', '', 'd', ''),
3202
    ('e', '', 'e', ''),
3203
    ('f', '', 'f', ''),
3204
    ('g', '', 'g', ''),
3205
    ('i', '', 'i', ''),
3206
    ('k', '', 'k', ''),
3207
    ('l', '', 'l', ''),
3208
    ('m', '', 'm', ''),
3209
    ('n', '', 'n', ''),
3210
    ('o', '', 'o', ''),
3211
    ('p', '', 'p', ''),
3212
    ('r', '', 'r', ''),
3213
    ('t', '', 't', ''),
3214
    ('u', '', 'u', ''),
3215
    ('v', '', 'v', ''),
3216
    ('z', '', 'z', '')
3217
    # do not put name of file here since it always gets merged into another file  # noqa: E501
3218
)
3219
3220
# sep/exactcommon.php
3221
# Sephardic
3222
3223 1
_SEP_EXACT_COMMON = (
3224
    ('h', '', '', ''),
3225
    # ("C","","","k"),  # c that can actually be ç
3226
    # VOICED - UNVOICED CONSONANTS
3227
    ('s', '[^t]', '[bgZd]', 'z'),
3228
    ('Z', '', '[pfkst]', 'S'),
3229
    ('Z', '', '$', 'S'),
3230
    ('S', '', '[bgzd]', 'Z'),
3231
    ('z', '', '$', 's'),
3232
    # special character to deal correctly in Hebrew match
3233
    ('B', '', '', 'b'),
3234
    ('V', '', '', 'v'),
3235
)
3236
3237
# sep/exactfrench.php
3238
# Sephardic
3239 1
_SEP_EXACT_FRENCH = ()
3240
3241
# sep/exacthebrew.php
3242 1
_SEP_EXACT_HEBREW = ()
3243
3244
# sep/exactitalian.php
3245
# Sephardic
3246 1
_SEP_EXACT_ITALIAN = ()
3247
3248
# sep/exactportuguese.php
3249
# Sephardic
3250 1
_SEP_EXACT_PORTUGUESE = ()
3251
3252
# sep/exactspanish.php
3253
# Sephardic
3254 1
_SEP_EXACT_SPANISH = ()
3255
3256
# sep/hebrewcommon.php
3257
# Sephardic
3258
3259 1
_SEP_HEBREW_COMMON = (
3260
    ('E', '', '', ''),  # final French "e": only in Sephardic
3261
    ('ts', '', '', 'C'),  # for not confusion Gutes [=guts] and Guts [=guc]
3262
    ('tS', '', '', 'C'),  # same reason
3263
    ('S', '', '', 's'),
3264
    ('p', '', '', 'f'),
3265
    ('b', '^', '', 'b'),
3266
    ('b', '', '', '(b|v)'),
3267
    ('ja', '', '', 'i'),
3268
    ('je', '', '', 'i'),
3269
    ('aj', '', '', 'i'),
3270
    ('j', '', '', 'i'),
3271
    ('a', '^', '', '1'),
3272
    ('e', '^', '', '1'),
3273
    ('a', '', '$', '1'),
3274
    ('e', '', '$', '1'),
3275
    ('a', '', '', ''),
3276
    ('e', '', '', ''),
3277
    ('oj', '^', '', '(u|vi)'),
3278
    ('uj', '^', '', '(u|vi)'),
3279
    ('oj', '', '', 'u'),
3280
    ('uj', '', '', 'u'),
3281
    ('ou', '^', '', '(u|v|1)'),
3282
    ('o', '^', '', '(u|v|1)'),
3283
    ('u', '^', '', '(u|v|1)'),
3284
    ('o', '', '$', '(u|1)'),
3285
    ('u', '', '$', '(u|1)'),
3286
    ('ou', '', '', 'u'),
3287
    ('o', '', '', 'u'),
3288
    ('VV', '', '', 'u'),  # alef/ayin + vov from ruleshebrew
3289
    ('L', '^', '', '1'),  # alef/ayin from  ruleshebrew
3290
    ('L', '', '$', '1'),  # alef/ayin from  ruleshebrew
3291
    ('L', '', '', ''),  # alef/ayin from  ruleshebrew
3292
    ('WW', '^', '', '(vi|u)'),  # vav-yod from  ruleshebrew
3293
    ('WW', '', '', 'u'),  # vav-yod from  ruleshebrew
3294
    ('W', '^', '', '(u|v)'),  # vav from  ruleshebrew
3295
    ('W', '', '', 'u'),  # vav from  ruleshebrew
3296
    # ("g","","","(g|Z)"),
3297
    # ("z","","","(z|Z)"),
3298
    # ("d","","","(d|dZ)"),
3299
    ('T', '', '', 't'),  # tet from  ruleshebrew
3300
    # ("k","","","(k|x)"),
3301
    # ("x","","","(k|x)"),
3302
    ('K', '', '', 'k'),  # kof and initial kaf from ruleshebrew
3303
    ('X', '', '', 'x'),  # khet and final kaf from ruleshebrew
3304
    # special for Spanish initial B/V
3305
    ('B', '', '', 'v'),
3306
    ('V', '', '', 'b'),
3307
    ('H', '^', '', '(x|1)'),
3308
    ('H', '', '$', '(x|1)'),
3309
    ('H', '', '', '(x|)'),
3310
    ('h', '^', '', '1'),
3311
    ('h', '', '', ''),
3312
)
3313
3314
# sep/lang.php
3315
# SEPHARDIC
3316
3317 1
_SEP_LANGUAGE_RULES = (
3318
    # 1. following are rules to accept the language
3319
    # 1.1 Special letter combinations
3320
    ('eau', 64, True),
3321
    ('ou', 64, True),
3322
    ('gni', 4160, True),
3323
    ('tx', 262144, True),
3324
    ('tj', 262144, True),
3325
    ('gy', 64, True),
3326
    ('guy', 64, True),
3327
    ('sh', 294912, True),  # English, but no sign for /sh/ in these languages
3328
    ('lh', 32768, True),
3329
    ('nh', 32768, True),
3330
    ('ny', 262144, True),
3331
    ('gue', 262208, True),
3332
    ('gui', 262208, True),
3333
    ('gia', 4096, True),
3334
    ('gie', 4096, True),
3335
    ('gio', 4096, True),
3336
    ('giu', 4096, True),
3337
    # 1.2 special characters
3338
    ('ñ', 262144, True),
3339
    ('â', 32832, True),
3340
    ('á', 294912, True),
3341
    ('à', 32768, True),
3342
    ('ã', 32768, True),
3343
    ('ê', 32832, True),
3344
    ('í', 294912, True),
3345
    ('î', 64, True),
3346
    ('ô', 32832, True),
3347
    ('õ', 32768, True),
3348
    ('ò', 266240, True),
3349
    ('ú', 294912, True),
3350
    ('ù', 64, True),
3351
    ('ü', 294912, True),
3352
    # Hebrew
3353
    ('א', 1024, True),
3354
    ('ב', 1024, True),
3355
    ('ג', 1024, True),
3356
    ('ד', 1024, True),
3357
    ('ה', 1024, True),
3358
    ('ו', 1024, True),
3359
    ('ז', 1024, True),
3360
    ('ח', 1024, True),
3361
    ('ט', 1024, True),
3362
    ('י', 1024, True),
3363
    ('כ', 1024, True),
3364
    ('ל', 1024, True),
3365
    ('מ', 1024, True),
3366
    ('נ', 1024, True),
3367
    ('ס', 1024, True),
3368
    ('ע', 1024, True),
3369
    ('פ', 1024, True),
3370
    ('צ', 1024, True),
3371
    ('ק', 1024, True),
3372
    ('ר', 1024, True),
3373
    ('ש', 1024, True),
3374
    ('ת', 1024, True),
3375
    # 2. following are rules to reject the language
3376
    # Every Latin character word has at least one Latin vowel
3377
    ('a', 1024, False),
3378
    ('o', 1024, False),
3379
    ('e', 1024, False),
3380
    ('i', 1024, False),
3381
    ('y', 1024, False),
3382
    ('u', 1024, False),
3383
    ('kh', 262144, False),
3384
    ('gua', 4096, False),
3385
    ('guo', 4096, False),
3386
    ('ç', 4096, False),
3387
    ('cha', 4096, False),
3388
    ('cho', 4096, False),
3389
    ('chu', 4096, False),
3390
    ('j', 4096, False),
3391
    ('dj', 262144, False),
3392
    ('sce', 64, False),
3393
    ('sci', 64, False),
3394
    ('ó', 64, False),
3395
    ('è', 32768, False),
3396
)
3397
3398
# sep/languagenames.php
3399 1
_SEP_LANGUAGES = (
3400
    'any',
3401
    'french',
3402
    'hebrew',
3403
    'italian',
3404
    'portuguese',
3405
    'spanish',
3406
)  # noqa: E501
3407
3408
# sep/rulesany.php
3409
# SEPHARDIC: INCORPORATES Portuguese + Italian + Spanish(+Catalan) + French
3410 1
_SEP_RULES_ANY = (
3411
    # CONSONANTS
3412
    ('ph', '', '', 'f'),  # foreign
3413
    ('sh', '', '', 'S'),  # foreign
3414
    ('kh', '', '', 'x'),  # foreign
3415
    ('gli', '', '', '(gli|l[4096])'),
3416
    ('gni', '', '', '(gni|ni[4160])'),
3417
    ('gn', '', '[aeou]', '(n[4160]|nj[4160]|gn)'),
3418
    ('gh', '', '', 'g'),  # It + translit. from Arabic
3419
    ('dh', '', '', 'd'),  # translit. from Arabic
3420
    ('bh', '', '', 'b'),  # translit. from Arabic
3421
    ('th', '', '', 't'),  # translit. from Arabic
3422
    ('lh', '', '', 'l'),  # Port
3423
    ('nh', '', '', 'nj'),  # Port
3424
    ('ig', '[aeiou]', '', '(ig|tS[262144])'),
3425
    ('ix', '[aeiou]', '', 'S'),  # Sp
3426
    ('tx', '', '', 'tS'),  # Sp
3427
    ('tj', '', '$', 'tS'),  # Sp
3428
    ('tj', '', '', 'dZ'),  # Sp
3429
    ('tg', '', '', '(tg|dZ[262144])'),
3430
    ('gi', '', '[aeou]', 'dZ'),  # italian
3431
    ('g', '', 'y', 'Z'),  # french
3432
    ('gg', '', '[ei]', '(gZ[32832]|dZ[266240]|x[262144])'),
3433
    ('g', '', '[ei]', '(Z[32832]|dZ[266240]|x[262144])'),
3434
    ('guy', '', '', 'gi'),
3435
    ('gue', '', '$', '(k[64]|ge)'),
3436
    ('gu', '', '[ei]', '(g|gv)'),  # not It
3437
    ('gu', '', '[ao]', 'gv'),  # not It
3438
    ('ñ', '', '', '(n|nj)'),
3439
    ('ny', '', '', 'nj'),
3440
    ('sc', '', '[ei]', '(s|S[4096])'),
3441
    ('sç', '', '[aeiou]', 's'),  # not It
3442
    ('ss', '', '', 's'),
3443
    ('ç', '', '', 's'),  # not It
3444
    ('ch', '', '[ei]', '(k[4096]|S[32832]|tS[262144]|dZ[262144])'),
3445
    ('ch', '', '', '(S|tS[262144]|dZ[262144])'),
3446
    ('ci', '', '[aeou]', '(tS[4096]|si)'),
3447
    ('cc', '', '[eiyéèê]', '(tS[4096]|ks[294976])'),
3448
    ('c', '', '[eiyéèê]', '(tS[4096]|s[294976])'),
3449
    # ("c","","[aou]","(k|C[294912])"), # "C" means that the actual letter could be "ç" (cedille omitted)  # noqa: E501
0 ignored issues
show
Coding Style introduced by
This line is too long as per the coding-style (119/100).

This check looks for lines that are too long. You can specify the maximum line length.

Loading history...
3450
    ('s', '^', '', 's'),
3451
    ('s', '[aáuiíoóeéêy]', '[aáuiíoóeéêy]', '(s[262144]|z[36928])'),
3452
    ('s', '', '[dglmnrv]', '(z|Z[32768])'),
3453
    (
3454
        'z',
3455
        '',
3456
        '$',
3457
        '(s|ts[4096]|S[32768])',
3458
    ),  # ts It, s/S/Z Port, s in Sp, z Fr  # noqa: E501
3459
    ('z', '', '[bdgv]', '(z|dz[4096]|Z[32768])'),  # dz It, Z/z Port, z Sp & Fr
3460
    ('z', '', '[ptckf]', '(s|ts[4096]|S[32768])'),  # ts It, s/S/z Port, z/s Sp
3461
    (
3462
        'z',
3463
        '',
3464
        '',
3465
        '(z|dz[4096]|ts[4096]|s[262144])',
3466
    ),  # ts/dz It, z Port & Fr, z/s Sp  # noqa: E501
3467
    ('que', '', '$', '(k[64]|ke)'),
3468
    ('qu', '', '[eiu]', 'k'),
3469
    ('qu', '', '[ao]', '(kv|k)'),  # k is It
3470
    ('ex', '', '[aáuiíoóeéêy]', '(ez[32768]|eS[32768]|eks|egz)'),
3471
    ('ex', '', '[cs]', '(e[32768]|ek)'),
3472
    ('m', '', '[cdglnrst]', '(m|n[32768])'),
3473
    ('m', '', '[bfpv]', '(m|n[294912])'),
3474
    ('m', '', '$', '(m|n[32768])'),
3475
    ('b', '^', '', '(b|V[262144])'),
3476
    ('v', '^', '', '(v|B[262144])'),
3477
    # VOWELS
3478
    ('eau', '', '', 'o'),  # Fr
3479
    ('ouh', '', '[aioe]', '(v[64]|uh)'),
3480
    ('uh', '', '[aioe]', '(v|uh)'),
3481
    ('ou', '', '[aioe]', 'v'),  # french
3482
    ('uo', '', '', '(vo|o)'),
3483
    ('u', '', '[aie]', 'v'),
3484
    ('i', '[aáuoóeéê]', '', 'j'),
3485
    ('i', '', '[aeou]', 'j'),
3486
    ('y', '[aáuiíoóeéê]', '', 'j'),
3487
    ('y', '', '[aeiíou]', 'j'),
3488
    ('e', '', '$', '(e|E[64])'),
3489
    ('ão', '', '', '(au|an)'),  # Port
3490
    ('ãe', '', '', '(aj|an)'),  # Port
3491
    ('ãi', '', '', '(aj|an)'),  # Port
3492
    ('õe', '', '', '(oj|on)'),  # Port
3493
    ('où', '', '', 'u'),  # Fr
3494
    ('ou', '', '', '(ou|u[64])'),
3495
    ('â', '', '', 'a'),  # Port & Fr
3496
    ('à', '', '', 'a'),  # Port
3497
    ('á', '', '', 'a'),  # Port & Sp
3498
    ('ã', '', '', '(a|an)'),  # Port
3499
    ('é', '', '', 'e'),
3500
    ('ê', '', '', 'e'),  # Port & Fr
3501
    ('è', '', '', 'e'),  # Sp & Fr & It
3502
    ('í', '', '', 'i'),  # Port & Sp
3503
    ('î', '', '', 'i'),  # Fr
3504
    ('ô', '', '', 'o'),  # Port & Fr
3505
    ('ó', '', '', 'o'),  # Port & Sp & It
3506
    ('õ', '', '', '(o|on)'),  # Port
3507
    ('ò', '', '', 'o'),  # Sp & It
3508
    ('ú', '', '', 'u'),  # Port & Sp
3509
    ('ü', '', '', 'u'),  # Port & Sp
3510
    # LATIN ALPHABET
3511
    ('a', '', '', 'a'),
3512
    ('b', '', '', '(b|v[262144])'),
3513
    ('c', '', '', 'k'),
3514
    ('d', '', '', 'd'),
3515
    ('e', '', '', 'e'),
3516
    ('f', '', '', 'f'),
3517
    ('g', '', '', 'g'),
3518
    ('h', '', '', 'h'),
3519
    ('i', '', '', 'i'),
3520
    ('j', '', '', '(x[262144]|Z)'),  # not It
3521
    ('k', '', '', 'k'),
3522
    ('l', '', '', 'l'),
3523
    ('m', '', '', 'm'),
3524
    ('n', '', '', 'n'),
3525
    ('o', '', '', 'o'),
3526
    ('p', '', '', 'p'),
3527
    ('q', '', '', 'k'),
3528
    ('r', '', '', 'r'),
3529
    ('s', '', '', '(s|S[32768])'),
3530
    ('t', '', '', 't'),
3531
    ('u', '', '', 'u'),
3532
    ('v', '', '', '(v|b[262144])'),
3533
    ('w', '', '', 'v'),  # foreign
3534
    ('x', '', '', '(ks|gz|S[294912])'),  # S/ks Port & Sp, gz Sp, It only ks
3535
    ('y', '', '', 'i'),
3536
    ('z', '', '', 'z'),
3537
)
3538
3539
# sep/rulesfrench.php
3540
3541
# Sephardic
3542 1
_SEP_RULES_FRENCH = (
3543
    # CONSONANTS
3544
    ('kh', '', '', 'x'),  # foreign
3545
    ('ph', '', '', 'f'),
3546
    ('ç', '', '', 's'),
3547
    ('x', '', '', 'ks'),
3548
    ('ch', '', '', 'S'),
3549
    ('c', '', '[eiyéèê]', 's'),
3550
    ('c', '', '', 'k'),
3551
    ('gn', '', '', '(n|gn)'),
3552
    ('g', '', '[eiy]', 'Z'),
3553
    ('gue', '', '$', 'k'),
3554
    ('gu', '', '[eiy]', 'g'),
3555
    # ("aill","","e","aj"), # non Jewish
3556
    # ("ll","","e","(l|j)"), # non Jewish
3557
    ('que', '', '$', 'k'),
3558
    ('qu', '', '', 'k'),
3559
    ('q', '', '', 'k'),
3560
    ('s', '[aeiouyéèê]', '[aeiouyéèê]', 'z'),
3561
    ('h', '[bdgt]', '', ''),  # translit from Arabic
3562
    ('h', '', '$', ''),  # foreign
3563
    ('j', '', '', 'Z'),
3564
    ('w', '', '', 'v'),
3565
    ('ouh', '', '[aioe]', '(v|uh)'),
3566
    ('ou', '', '[aeio]', 'v'),
3567
    ('uo', '', '', '(vo|o)'),
3568
    ('u', '', '[aeio]', 'v'),
3569
    # VOWELS
3570
    ('aue', '', '', 'aue'),
3571
    ('eau', '', '', 'o'),
3572
    # ("au","","","(o|au)"), # non Jewish
3573
    ('ai', '', '', 'aj'),  # [e] is non Jewish
3574
    ('ay', '', '', 'aj'),  # [e] is non Jewish
3575
    ('é', '', '', 'e'),
3576
    ('ê', '', '', 'e'),
3577
    ('è', '', '', 'e'),
3578
    ('à', '', '', 'a'),
3579
    ('â', '', '', 'a'),
3580
    ('où', '', '', 'u'),
3581
    ('ou', '', '', 'u'),
3582
    ('oi', '', '', 'oj'),  # [ua] is non Jewish
3583
    ('ei', '', '', 'ej'),  # [e] is non Jewish, in Ashk should be aj
3584
    ('ey', '', '', 'ej'),  # [e] non Jewish, in Ashk should be aj
3585
    # ("eu","","","(e|o)"), # non Jewish
3586
    ('y', '[ou]', '', 'j'),
3587
    ('e', '', '$', '(e|)'),
3588
    ('i', '', '[aou]', 'j'),
3589
    ('y', '', '[aoeu]', 'j'),
3590
    ('y', '', '', 'i'),
3591
    # TRIVIAL
3592
    ('a', '', '', 'a'),
3593
    ('b', '', '', 'b'),
3594
    ('d', '', '', 'd'),
3595
    ('e', '', '', 'e'),
3596
    ('f', '', '', 'f'),
3597
    ('g', '', '', 'g'),
3598
    ('h', '', '', 'h'),
3599
    ('i', '', '', 'i'),
3600
    ('k', '', '', 'k'),
3601
    ('l', '', '', 'l'),
3602
    ('m', '', '', 'm'),
3603
    ('n', '', '', 'n'),
3604
    ('o', '', '', 'o'),
3605
    ('p', '', '', 'p'),
3606
    ('r', '', '', 'r'),
3607
    ('s', '', '', 's'),
3608
    ('t', '', '', 't'),
3609
    ('u', '', '', 'u'),
3610
    ('v', '', '', 'v'),
3611
    ('z', '', '', 'z'),
3612
)
3613
3614
# sep/ruleshebrew.php
3615
3616
# Sephardic
3617 1
_SEP_RULES_HEBREW = (
3618
    ('אי', '', '', 'i'),
3619
    ('עי', '', '', 'i'),
3620
    ('עו', '', '', 'VV'),
3621
    ('או', '', '', 'VV'),
3622
    ('ג׳', '', '', 'Z'),
3623
    ('ד׳', '', '', 'dZ'),
3624
    ('א', '', '', 'L'),
3625
    ('ב', '', '', 'b'),
3626
    ('ג', '', '', 'g'),
3627
    ('ד', '', '', 'd'),
3628
    ('ה', '^', '', '1'),
3629
    ('ה', '', '$', '1'),
3630
    ('ה', '', '', ''),
3631
    ('וו', '', '', 'V'),
3632
    ('וי', '', '', 'WW'),
3633
    ('ו', '', '', 'W'),
3634
    ('ז', '', '', 'z'),
3635
    ('ח', '', '', 'X'),
3636
    ('ט', '', '', 'T'),
3637
    ('יי', '', '', 'i'),
3638
    ('י', '', '', 'i'),
3639
    ('ך', '', '', 'X'),
3640
    ('כ', '^', '', 'K'),
3641
    ('כ', '', '', 'k'),
3642
    ('ל', '', '', 'l'),
3643
    ('ם', '', '', 'm'),
3644
    ('מ', '', '', 'm'),
3645
    ('ן', '', '', 'n'),
3646
    ('נ', '', '', 'n'),
3647
    ('ס', '', '', 's'),
3648
    ('ע', '', '', 'L'),
3649
    ('ף', '', '', 'f'),
3650
    ('פ', '', '', 'f'),
3651
    ('ץ', '', '', 'C'),
3652
    ('צ', '', '', 'C'),
3653
    ('ק', '', '', 'K'),
3654
    ('ר', '', '', 'r'),
3655
    ('ש', '', '', 's'),
3656
    ('ת', '', '', 'T'),  # Special for Sephardim
3657
)
3658
3659
# sep/rulesitalian.php
3660
3661 1
_SEP_RULES_ITALIAN = (
3662
    ('kh', '', '', 'x'),  # foreign
3663
    ('gli', '', '', '(l|gli)'),
3664
    ('gn', '', '[aeou]', '(n|nj|gn)'),
3665
    ('gni', '', '', '(ni|gni)'),
3666
    ('gi', '', '[aeou]', 'dZ'),
3667
    ('gg', '', '[ei]', 'dZ'),
3668
    ('g', '', '[ei]', 'dZ'),
3669
    ('h', '[bdgt]', '', 'g'),  # gh is It; others from Arabic translit
3670
    ('ci', '', '[aeou]', 'tS'),
3671
    ('ch', '', '[ei]', 'k'),
3672
    ('sc', '', '[ei]', 'S'),
3673
    ('cc', '', '[ei]', 'tS'),
3674
    ('c', '', '[ei]', 'tS'),
3675
    ('s', '[aeiou]', '[aeiou]', 'z'),
3676
    ('i', '[aeou]', '', 'j'),
3677
    ('i', '', '[aeou]', 'j'),
3678
    ('y', '[aeou]', '', 'j'),  # foreign
3679
    ('y', '', '[aeou]', 'j'),  # foreign
3680
    ('qu', '', '', 'k'),
3681
    ('uo', '', '', '(vo|o)'),
3682
    ('u', '', '[aei]', 'v'),
3683
    ('è', '', '', 'e'),
3684
    ('é', '', '', 'e'),
3685
    ('ò', '', '', 'o'),
3686
    ('ó', '', '', 'o'),
3687
    # LATIN ALPHABET
3688
    ('a', '', '', 'a'),
3689
    ('b', '', '', 'b'),
3690
    ('c', '', '', 'k'),
3691
    ('d', '', '', 'd'),
3692
    ('e', '', '', 'e'),
3693
    ('f', '', '', 'f'),
3694
    ('g', '', '', 'g'),
3695
    ('h', '', '', 'h'),
3696
    ('i', '', '', 'i'),
3697
    ('j', '', '', '(Z|dZ|j)'),  # foreign
3698
    ('k', '', '', 'k'),
3699
    ('l', '', '', 'l'),
3700
    ('m', '', '', 'm'),
3701
    ('n', '', '', 'n'),
3702
    ('o', '', '', 'o'),
3703
    ('p', '', '', 'p'),
3704
    ('q', '', '', 'k'),
3705
    ('r', '', '', 'r'),
3706
    ('s', '', '', 's'),
3707
    ('t', '', '', 't'),
3708
    ('u', '', '', 'u'),
3709
    ('v', '', '', 'v'),
3710
    ('w', '', '', 'v'),  # foreign
3711
    ('x', '', '', 'ks'),  # foreign
3712
    ('y', '', '', 'i'),  # foreign
3713
    ('z', '', '', '(ts|dz)'),
3714
)
3715
3716
# sep/rulesportuguese.php
3717
3718 1
_SEP_RULES_PORTUGUESE = (
3719
    ('kh', '', '', 'x'),  # foreign
3720
    ('ch', '', '', 'S'),
3721
    ('ss', '', '', 's'),
3722
    ('sc', '', '[ei]', 's'),
3723
    ('sç', '', '[aou]', 's'),
3724
    ('ç', '', '', 's'),
3725
    ('c', '', '[ei]', 's'),
3726
    # ("c","","[aou]","(k|C)"),
3727
    ('s', '^', '', 's'),
3728
    ('s', '[aáuiíoóeéêy]', '[aáuiíoóeéêy]', 'z'),
3729
    ('s', '', '[dglmnrv]', '(Z|S)'),  # Z is Brazil
3730
    ('z', '', '$', '(Z|s|S)'),  # s and S in Brazil
3731
    ('z', '', '[bdgv]', '(Z|z)'),  # Z in Brazil
3732
    ('z', '', '[ptckf]', '(s|S|z)'),  # s and S in Brazil
3733
    ('gu', '', '[eiu]', 'g'),
3734
    ('gu', '', '[ao]', 'gv'),
3735
    ('g', '', '[ei]', 'Z'),
3736
    ('qu', '', '[eiu]', 'k'),
3737
    ('qu', '', '[ao]', 'kv'),
3738
    ('uo', '', '', '(vo|o|u)'),
3739
    ('u', '', '[aei]', 'v'),
3740
    ('lh', '', '', 'l'),
3741
    ('nh', '', '', 'nj'),
3742
    ('h', '[bdgt]', '', ''),  # translit. from Arabic
3743
    ('ex', '', '[aáuiíoóeéêy]', '(ez|eS|eks)'),  # ez in Brazil
3744
    ('ex', '', '[cs]', 'e'),
3745
    ('y', '[aáuiíoóeéê]', '', 'j'),
3746
    ('y', '', '[aeiíou]', 'j'),
3747
    (
3748
        'm',
3749
        '',
3750
        '[bcdfglnprstv]',
3751
        '(m|n)',
3752
    ),  # maybe to add a rule for m/n before a consonant that disappears [preceeding vowel becomes nasalized]  # noqa: E501
0 ignored issues
show
Coding Style introduced by
This line is too long as per the coding-style (123/100).

This check looks for lines that are too long. You can specify the maximum line length.

Loading history...
3753
    (
3754
        'm',
3755
        '',
3756
        '$',
3757
        '(m|n)',
3758
    ),  # maybe to add a rule for final m/n that disappears [preceeding vowel becomes nasalized]  # noqa: E501
0 ignored issues
show
Coding Style introduced by
This line is too long as per the coding-style (110/100).

This check looks for lines that are too long. You can specify the maximum line length.

Loading history...
3759
    ('ão', '', '', '(au|an|on)'),
3760
    ('ãe', '', '', '(aj|an)'),
3761
    ('ãi', '', '', '(aj|an)'),
3762
    ('õe', '', '', '(oj|on)'),
3763
    ('i', '[aáuoóeéê]', '', 'j'),
3764
    ('i', '', '[aeou]', 'j'),
3765
    ('â', '', '', 'a'),
3766
    ('à', '', '', 'a'),
3767
    ('á', '', '', 'a'),
3768
    ('ã', '', '', '(a|an|on)'),
3769
    ('é', '', '', 'e'),
3770
    ('ê', '', '', 'e'),
3771
    ('í', '', '', 'i'),
3772
    ('ô', '', '', 'o'),
3773
    ('ó', '', '', 'o'),
3774
    ('õ', '', '', '(o|on)'),
3775
    ('ú', '', '', 'u'),
3776
    ('ü', '', '', 'u'),
3777
    ('aue', '', '', 'aue'),
3778
    # LATIN ALPHABET
3779
    ('a', '', '', 'a'),
3780
    ('b', '', '', 'b'),
3781
    ('c', '', '', 'k'),
3782
    ('d', '', '', 'd'),
3783
    ('e', '', '', '(e|i)'),
3784
    ('f', '', '', 'f'),
3785
    ('g', '', '', 'g'),
3786
    ('h', '', '', 'h'),
3787
    ('i', '', '', 'i'),
3788
    ('j', '', '', 'Z'),
3789
    ('k', '', '', 'k'),
3790
    ('l', '', '', 'l'),
3791
    ('m', '', '', 'm'),
3792
    ('n', '', '', 'n'),
3793
    ('o', '', '', '(o|u)'),
3794
    ('p', '', '', 'p'),
3795
    ('q', '', '', 'k'),
3796
    ('r', '', '', 'r'),
3797
    ('s', '', '', 'S'),
3798
    ('t', '', '', 't'),
3799
    ('u', '', '', 'u'),
3800
    ('v', '', '', 'v'),
3801
    ('w', '', '', 'v'),
3802
    ('x', '', '', '(S|ks)'),
3803
    ('y', '', '', 'i'),
3804
    ('z', '', '', 'z'),
3805
)
3806
3807
# sep/rulesspanish.php
3808
3809
# Sephardic
3810 1
_SEP_RULES_SPANISH = (
3811
    # Includes both Spanish (Castillian) & Catalan
3812
    # CONSONANTS
3813
    ('ñ', '', '', '(n|nj)'),
3814
    ('ny', '', '', 'nj'),  # Catalan
3815
    ('ç', '', '', 's'),  # Catalan
3816
    ('ig', '[aeiou]', '', '(tS|ig)'),  # tS is Catalan
3817
    ('ix', '[aeiou]', '', 'S'),  # Catalan
3818
    ('tx', '', '', 'tS'),  # Catalan
3819
    ('tj', '', '$', 'tS'),  # Catalan
3820
    ('tj', '', '', 'dZ'),  # Catalan
3821
    ('tg', '', '', '(tg|dZ)'),  # dZ is Catalan
3822
    ('ch', '', '', '(tS|dZ)'),  # dZ is typical for Argentina
3823
    ('bh', '', '', 'b'),  # translit. from Arabic
3824
    ('h', '[dgt]', '', ''),  # translit. from Arabic
3825
    ('j', '', '', '(x|Z)'),  # Z is Catalan
3826
    ('x', '', '', '(ks|gz|S)'),  # ks is Spanish, all are Catalan
3827
    # ("ll","","","(l|Z)"), # Z is typical for Argentina, only Ashkenazic
3828
    ('w', '', '', 'v'),  # foreign words
3829
    ('v', '^', '', '(B|v)'),
3830
    ('b', '^', '', '(b|V)'),
3831
    ('v', '', '', '(b|v)'),
3832
    ('b', '', '', '(b|v)'),
3833
    ('m', '', '[bpvf]', '(m|n)'),
3834
    ('c', '', '[ei]', 's'),
3835
    # ("c","","[aou]","(k|C)"),
3836
    ('c', '', '', 'k'),
3837
    (
3838
        'z',
3839
        '',
3840
        '',
3841
        '(z|s)',
3842
    ),  # as "c" befoire "e" or "i", in Spain it is like unvoiced English "th"  # noqa: E501
3843
    ('gu', '', '[ei]', '(g|gv)'),  # "gv" because "u" can actually be "ü"
3844
    ('g', '', '[ei]', '(x|g|dZ)'),  # "g" only for foreign words; dZ is Catalan
3845
    ('qu', '', '', 'k'),
3846
    ('q', '', '', 'k'),
3847
    ('uo', '', '', '(vo|o)'),
3848
    ('u', '', '[aei]', 'v'),
3849
    # ("y","","","(i|j|S|Z)"), # S or Z are peculiar to South America; only Ashkenazic  # noqa: E501
3850
    ('y', '', '', '(i|j)'),
3851
    # VOWELS
3852
    ('ü', '', '', 'v'),
3853
    ('á', '', '', 'a'),
3854
    ('é', '', '', 'e'),
3855
    ('í', '', '', 'i'),
3856
    ('ó', '', '', 'o'),
3857
    ('ú', '', '', 'u'),
3858
    ('à', '', '', 'a'),  # Catalan
3859
    ('è', '', '', 'e'),  # Catalan
3860
    ('ò', '', '', 'o'),  # Catalan
3861
    # TRIVIAL
3862
    ('a', '', '', 'a'),
3863
    ('d', '', '', 'd'),
3864
    ('e', '', '', 'e'),
3865
    ('f', '', '', 'f'),
3866
    ('g', '', '', 'g'),
3867
    ('h', '', '', 'h'),
3868
    ('i', '', '', 'i'),
3869
    ('k', '', '', 'k'),
3870
    ('l', '', '', 'l'),
3871
    ('m', '', '', 'm'),
3872
    ('n', '', '', 'n'),
3873
    ('o', '', '', 'o'),
3874
    ('p', '', '', 'p'),
3875
    ('r', '', '', 'r'),
3876
    ('s', '', '', 's'),
3877
    ('t', '', '', 't'),
3878
    ('u', '', '', 'u'),
3879
)
3880
3881
# ash/approxany.php
3882
3883
# ASHKENAZIC
3884
3885
# A, E, I, O, P, U should create variants, but a, e, i, o, u should not
# create any new variant
3886
# Q = ü ; Y = ä = ö
3887
# H = initial "H" in German/English
3888 1
_ASH_APPROX_ANY = (
3889
    # CONSONANTS
3890
    ('b', '', '', '(b|v[262144])'),
3891
    (
3892
        'J',
3893
        '',
3894
        '',
3895
        'z',
3896
    ),  # Argentina Spanish: "ll" = /Z/, but approximately /Z/ = /z/  # noqa: E501
3897
    # VOWELS
3898
    # "ALL" DIPHTHONGS are interchangeable BETWEEN THEM and with monophthongs of which they are composed ("D" means "diphthong")  # noqa: E501
0 ignored issues
show
Coding Style introduced by
This line is too long as per the coding-style (142/100).

This check looks for lines that are too long. You can specify the maximum line length.

Loading history...
3899
    # {a,o} are totally interchangeable if non-stressed; in German "a/o" can actually be from "ä/ö" (that are equivalent to "e")  # noqa: E501
0 ignored issues
show
Coding Style introduced by
This line is too long as per the coding-style (142/100).

This check looks for lines that are too long. You can specify the maximum line length.

Loading history...
3900
    # {i,e} are interchangeable if non-stressed, while in German "u" can actually be from "ü" (that is equivalent to "i")  # noqa: E501
0 ignored issues
show
Coding Style introduced by
This line is too long as per the coding-style (135/100).

This check looks for lines that are too long. You can specify the maximum line length.

Loading history...
3901
    ('aiB', '', '[bp]', '(D|Dm)'),
3902
    ('AiB', '', '[bp]', '(D|Dm)'),
3903
    ('oiB', '', '[bp]', '(D|Dm)'),
3904
    ('OiB', '', '[bp]', '(D|Dm)'),
3905
    ('uiB', '', '[bp]', '(D|Dm)'),
3906
    ('UiB', '', '[bp]', '(D|Dm)'),
3907
    ('eiB', '', '[bp]', '(D|Dm)'),
3908
    ('EiB', '', '[bp]', '(D|Dm)'),
3909
    ('iiB', '', '[bp]', '(D|Dm)'),
3910
    ('IiB', '', '[bp]', '(D|Dm)'),
3911
    ('aiB', '', '[dgkstvz]', '(D|Dn)'),
3912
    ('AiB', '', '[dgkstvz]', '(D|Dn)'),
3913
    ('oiB', '', '[dgkstvz]', '(D|Dn)'),
3914
    ('OiB', '', '[dgkstvz]', '(D|Dn)'),
3915
    ('uiB', '', '[dgkstvz]', '(D|Dn)'),
3916
    ('UiB', '', '[dgkstvz]', '(D|Dn)'),
3917
    ('eiB', '', '[dgkstvz]', '(D|Dn)'),
3918
    ('EiB', '', '[dgkstvz]', '(D|Dn)'),
3919
    ('iiB', '', '[dgkstvz]', '(D|Dn)'),
3920
    ('IiB', '', '[dgkstvz]', '(D|Dn)'),
3921
    ('B', '', '[bp]', '(o|om[16384]|im[16384])'),
3922
    ('B', '', '[dgkstvz]', '(a|o|on[16384]|in[16384])'),
3923
    ('B', '', '', '(a|o)'),
3924
    ('aiF', '', '[bp]', '(D|Dm)'),
3925
    ('AiF', '', '[bp]', '(D|Dm)'),
3926
    ('oiF', '', '[bp]', '(D|Dm)'),
3927
    ('OiF', '', '[bp]', '(D|Dm)'),
3928
    ('uiF', '', '[bp]', '(D|Dm)'),
3929
    ('UiF', '', '[bp]', '(D|Dm)'),
3930
    ('eiF', '', '[bp]', '(D|Dm)'),
3931
    ('EiF', '', '[bp]', '(D|Dm)'),
3932
    ('iiF', '', '[bp]', '(D|Dm)'),
3933
    ('IiF', '', '[bp]', '(D|Dm)'),
3934
    ('aiF', '', '[dgkstvz]', '(D|Dn)'),
3935
    ('AiF', '', '[dgkstvz]', '(D|Dn)'),
3936
    ('oiF', '', '[dgkstvz]', '(D|Dn)'),
3937
    ('OiF', '', '[dgkstvz]', '(D|Dn)'),
3938
    ('uiF', '', '[dgkstvz]', '(D|Dn)'),
3939
    ('UiF', '', '[dgkstvz]', '(D|Dn)'),
3940
    ('eiF', '', '[dgkstvz]', '(D|Dn)'),
3941
    ('EiF', '', '[dgkstvz]', '(D|Dn)'),
3942
    ('iiF', '', '[dgkstvz]', '(D|Dn)'),
3943
    ('IiF', '', '[dgkstvz]', '(D|Dn)'),
3944
    ('F', '', '[bp]', '(i|im[16384]|om[16384])'),
3945
    ('F', '', '[dgkstvz]', '(i|in[16384]|on[16384])'),
3946
    ('F', '', '', 'i'),
3947
    ('P', '', '', '(o|u)'),
3948
    ('I', '[aeiouAEIBFOUQY]', '', 'i'),
3949
    ('I', '', '[^aeiouAEBFIOU]e', '(Q[128]|i|D[32])'),  # "line"
3950
    ('I', '', '$', 'i'),
3951
    ('I', '', '[^k]$', 'i'),
3952
    ('Ik', '[lr]', '$', '(ik|Qk[128])'),
3953
    ('Ik', '', '$', 'ik'),
3954
    ('sIts', '', '$', '(sits|sQts[128])'),
3955
    ('Its', '', '$', 'its'),
3956
    ('I', '', '', '(Q[128]|i)'),
3957
    ('lE', '[bdfgkmnprsStvzZ]', '$', '(li|il[32])'),  # Apple < Appel
3958
    (
3959
        'lE',
3960
        '[bdfgkmnprsStvzZ]',
3961
        '',
3962
        '(li|il[32]|lY[128])',
3963
    ),  # Applebaum < Appelbaum  # noqa: E501
3964
    ('au', '', '', '(D|a|u)'),
3965
    ('ou', '', '', '(D|o|u)'),
3966
    ('ai', '', '', '(D|a|i)'),
3967
    ('Ai', '', '', '(D|a|i)'),
3968
    ('oi', '', '', '(D|o|i)'),
3969
    ('Oi', '', '', '(D|o|i)'),
3970
    ('ui', '', '', '(D|u|i)'),
3971
    ('Ui', '', '', '(D|u|i)'),
3972
    ('ei', '', '', '(D|i)'),
3973
    ('Ei', '', '', '(D|i)'),
3974
    ('iA', '', '$', '(ia|io)'),
3975
    ('iA', '', '', '(ia|io|iY[128])'),
3976
    ('A', '', '[^aeiouAEBFIOU]e', '(a|o|Y[128]|D[32])'),  # "plane"
3977
    (
3978
        'E',
3979
        'i[^aeiouAEIOU]',
3980
        '',
3981
        '(i|Y[128]|[32])',
3982
    ),  # Wineberg (vineberg/vajneberg) --> vajnberg  # noqa: E501
3983
    (
3984
        'E',
3985
        'a[^aeiouAEIOU]',
3986
        '',
3987
        '(i|Y[128]|[32])',
3988
    ),  # Shaneberg (shaneberg/shejneberg) --> shejnberg  # noqa: E501
3989
    ('e', '', '[fklmnprstv]$', 'i'),
3990
    ('e', '', 'ts$', 'i'),
3991
    ('e', '', '$', 'i'),
3992
    ('e', '[DaoiuAOIUQY]', '', 'i'),
3993
    ('e', '', '[aoAOQY]', 'i'),
3994
    ('e', '', '', '(i|Y[128])'),
3995
    ('E', '', '[fklmnprst]$', 'i'),
3996
    ('E', '', 'ts$', 'i'),
3997
    ('E', '', '$', 'i'),
3998
    ('E', '[DaoiuAOIUQY]', '', 'i'),
3999
    ('E', '', '[aoAOQY]', 'i'),
4000
    ('E', '', '', '(i|Y[128])'),
4001
    ('a', '', '', '(a|o)'),
4002
    ('O', '', '[fklmnprstv]$', 'o'),
4003
    ('O', '', 'ts$', 'o'),
4004
    ('O', '', '$', 'o'),
4005
    ('O', '[oeiuQY]', '', 'o'),
4006
    ('O', '', '', '(o|Y[128])'),
4007
    ('A', '', '[fklmnprst]$', '(a|o)'),
4008
    ('A', '', 'ts$', '(a|o)'),
4009
    ('A', '', '$', '(a|o)'),
4010
    ('A', '[oeiuQY]', '', '(a|o)'),
4011
    ('A', '', '', '(a|o|Y[128])'),
4012
    ('U', '', '$', 'u'),
4013
    ('U', '[DoiuQY]', '', 'u'),
4014
    ('U', '', '[^k]$', 'u'),
4015
    ('Uk', '[lr]', '$', '(uk|Qk[128])'),
4016
    ('Uk', '', '$', 'uk'),
4017
    ('sUts', '', '$', '(suts|sQts[128])'),
4018
    ('Uts', '', '$', 'uts'),
4019
    ('U', '', '', '(u|Q[128])'),
4020
)
4021
4022
# ash/approxcommon.php
4023
# Ashkenazic
4024
4025 1
_ASH_APPROX_COMMON = (
4026
    # REGRESSIVE ASSIMILATION OF CONSONANTS
4027
    ('n', '', '[bp]', 'm'),
4028
    # PECULIARITY OF "h"
4029
    ('h', '', '', ''),
4030
    ('H', '', '', '(x|)'),
4031
    # POLISH OGONEK IMPOSSIBLE
4032
    ('F', '', '[bdgkpstvzZ]h', 'e'),
4033
    ('F', '', '[bdgkpstvzZ]x', 'e'),
4034
    ('B', '', '[bdgkpstvzZ]h', 'a'),
4035
    ('B', '', '[bdgkpstvzZ]x', 'a'),
4036
    # "e" and "i" ARE TO BE OMITTED BEFORE (SYLLABIC) n & l: Halperin=Halpern; Frankel = Frankl, Finkelstein = Finklstein  # noqa: E501
0 ignored issues
show
Coding Style introduced by
This line is too long as per the coding-style (135/100).

This check looks for lines that are too long. You can specify the maximum line length.

Loading history...
4037
    ('e', '[bdfgklmnprsStvzZ]', '[ln]$', ''),
4038
    ('i', '[bdfgklmnprsStvzZ]', '[ln]$', ''),
4039
    ('E', '[bdfgklmnprsStvzZ]', '[ln]$', ''),
4040
    ('I', '[bdfgklmnprsStvzZ]', '[ln]$', ''),
4041
    ('F', '[bdfgklmnprsStvzZ]', '[ln]$', ''),
4042
    ('Q', '[bdfgklmnprsStvzZ]', '[ln]$', ''),
4043
    ('Y', '[bdfgklmnprsStvzZ]', '[ln]$', ''),
4044
    ('e', '[bdfgklmnprsStvzZ]', '[ln][bdfgklmnprsStvzZ]', '(e|)'),
4045
    ('i', '[bdfgklmnprsStvzZ]', '[ln][bdfgklmnprsStvzZ]', '(i|)'),
4046
    ('E', '[bdfgklmnprsStvzZ]', '[ln][bdfgklmnprsStvzZ]', '(E|)'),
4047
    ('I', '[bdfgklmnprsStvzZ]', '[ln][bdfgklmnprsStvzZ]', '(I|)'),
4048
    ('F', '[bdfgklmnprsStvzZ]', '[ln][bdfgklmnprsStvzZ]', '(F|)'),
4049
    ('Q', '[bdfgklmnprsStvzZ]', '[ln][bdfgklmnprsStvzZ]', '(Q|)'),
4050
    ('Y', '[bdfgklmnprsStvzZ]', '[ln][bdfgklmnprsStvzZ]', '(Y|)'),
4051
    (
4052
        'lEs',
4053
        '',
4054
        '',
4055
        '(lEs|lz)',
4056
    ),  # Applebaum < Appelbaum (English + blend English-something forms as Finklestein)  # noqa: E501
0 ignored issues
show
Coding Style introduced by
This line is too long as per the coding-style (102/100).

This check looks for lines that are too long. You can specify the maximum line length.

Loading history...
4057
    (
4058
        'lE',
4059
        '[bdfgkmnprStvzZ]',
4060
        '',
4061
        '(lE|l)',
4062
    ),  # Applebaum < Appelbaum (English + blend English-something forms as Finklestein)  # noqa: E501
0 ignored issues
show
Coding Style introduced by
This line is too long as per the coding-style (102/100).

This check looks for lines that are too long. You can specify the maximum line length.

Loading history...
4063
    # SIMPLIFICATION: (TRIPHTHONGS & DIPHTHONGS) -> ONE GENERIC DIPHTHONG "D"
4064
    ('aue', '', '', 'D'),
4065
    ('oue', '', '', 'D'),
4066
    ('AvE', '', '', '(D|AvE)'),
4067
    ('Ave', '', '', '(D|Ave)'),
4068
    ('avE', '', '', '(D|avE)'),
4069
    ('ave', '', '', '(D|ave)'),
4070
    ('OvE', '', '', '(D|OvE)'),
4071
    ('Ove', '', '', '(D|Ove)'),
4072
    ('ovE', '', '', '(D|ovE)'),
4073
    ('ove', '', '', '(D|ove)'),
4074
    ('ea', '', '', '(D|ea)'),
4075
    ('EA', '', '', '(D|EA)'),
4076
    ('Ea', '', '', '(D|Ea)'),
4077
    ('eA', '', '', '(D|eA)'),
4078
    ('aji', '', '', 'D'),
4079
    ('ajI', '', '', 'D'),
4080
    ('aje', '', '', 'D'),
4081
    ('ajE', '', '', 'D'),
4082
    ('Aji', '', '', 'D'),
4083
    ('AjI', '', '', 'D'),
4084
    ('Aje', '', '', 'D'),
4085
    ('AjE', '', '', 'D'),
4086
    ('oji', '', '', 'D'),
4087
    ('ojI', '', '', 'D'),
4088
    ('oje', '', '', 'D'),
4089
    ('ojE', '', '', 'D'),
4090
    ('Oji', '', '', 'D'),
4091
    ('OjI', '', '', 'D'),
4092
    ('Oje', '', '', 'D'),
4093
    ('OjE', '', '', 'D'),
4094
    ('eji', '', '', 'D'),
4095
    ('ejI', '', '', 'D'),
4096
    ('eje', '', '', 'D'),
4097
    ('ejE', '', '', 'D'),
4098
    ('Eji', '', '', 'D'),
4099
    ('EjI', '', '', 'D'),
4100
    ('Eje', '', '', 'D'),
4101
    ('EjE', '', '', 'D'),
4102
    ('uji', '', '', 'D'),
4103
    ('ujI', '', '', 'D'),
4104
    ('uje', '', '', 'D'),
4105
    ('ujE', '', '', 'D'),
4106
    ('Uji', '', '', 'D'),
4107
    ('UjI', '', '', 'D'),
4108
    ('Uje', '', '', 'D'),
4109
    ('UjE', '', '', 'D'),
4110
    ('iji', '', '', 'D'),
4111
    ('ijI', '', '', 'D'),
4112
    ('ije', '', '', 'D'),
4113
    ('ijE', '', '', 'D'),
4114
    ('Iji', '', '', 'D'),
4115
    ('IjI', '', '', 'D'),
4116
    ('Ije', '', '', 'D'),
4117
    ('IjE', '', '', 'D'),
4118
    ('aja', '', '', 'D'),
4119
    ('ajA', '', '', 'D'),
4120
    ('ajo', '', '', 'D'),
4121
    ('ajO', '', '', 'D'),
4122
    ('aju', '', '', 'D'),
4123
    ('ajU', '', '', 'D'),
4124
    ('Aja', '', '', 'D'),
4125
    ('AjA', '', '', 'D'),
4126
    ('Ajo', '', '', 'D'),
4127
    ('AjO', '', '', 'D'),
4128
    ('Aju', '', '', 'D'),
4129
    ('AjU', '', '', 'D'),
4130
    ('oja', '', '', 'D'),
4131
    ('ojA', '', '', 'D'),
4132
    ('ojo', '', '', 'D'),
4133
    ('ojO', '', '', 'D'),
4134
    ('Aju', '', '', 'D'),
4135
    ('AjU', '', '', 'D'),
4136
    ('Oja', '', '', 'D'),
4137
    ('OjA', '', '', 'D'),
4138
    ('Ojo', '', '', 'D'),
4139
    ('OjO', '', '', 'D'),
4140
    ('Aju', '', '', 'D'),
4141
    ('AjU', '', '', 'D'),
4142
    ('eja', '', '', 'D'),
4143
    ('ejA', '', '', 'D'),
4144
    ('ejo', '', '', 'D'),
4145
    ('ejO', '', '', 'D'),
4146
    ('Aju', '', '', 'D'),
4147
    ('AjU', '', '', 'D'),
4148
    ('Eja', '', '', 'D'),
4149
    ('EjA', '', '', 'D'),
4150
    ('Ejo', '', '', 'D'),
4151
    ('EjO', '', '', 'D'),
4152
    ('Aju', '', '', 'D'),
4153
    ('AjU', '', '', 'D'),
4154
    ('uja', '', '', 'D'),
4155
    ('ujA', '', '', 'D'),
4156
    ('ujo', '', '', 'D'),
4157
    ('ujO', '', '', 'D'),
4158
    ('Aju', '', '', 'D'),
4159
    ('AjU', '', '', 'D'),
4160
    ('Uja', '', '', 'D'),
4161
    ('UjA', '', '', 'D'),
4162
    ('Ujo', '', '', 'D'),
4163
    ('UjO', '', '', 'D'),
4164
    ('Aju', '', '', 'D'),
4165
    ('AjU', '', '', 'D'),
4166
    ('ija', '', '', 'D'),
4167
    ('ijA', '', '', 'D'),
4168
    ('ijo', '', '', 'D'),
4169
    ('ijO', '', '', 'D'),
4170
    ('Aju', '', '', 'D'),
4171
    ('AjU', '', '', 'D'),
4172
    ('Ija', '', '', 'D'),
4173
    ('IjA', '', '', 'D'),
4174
    ('Ijo', '', '', 'D'),
4175
    ('IjO', '', '', 'D'),
4176
    ('Aju', '', '', 'D'),
4177
    ('AjU', '', '', 'D'),
4178
    ('j', '', '', 'i'),
4179
    # lander = lender = länder
4180
    ('lYndEr', '', '$', 'lYnder'),
4181
    ('lander', '', '$', 'lYnder'),
4182
    ('lAndEr', '', '$', 'lYnder'),
4183
    ('lAnder', '', '$', 'lYnder'),
4184
    ('landEr', '', '$', 'lYnder'),
4185
    ('lender', '', '$', 'lYnder'),
4186
    ('lEndEr', '', '$', 'lYnder'),
4187
    ('lendEr', '', '$', 'lYnder'),
4188
    ('lEnder', '', '$', 'lYnder'),
4189
    # burg = berg
4190
    ('bUrk', '', '$', '(burk|berk)'),
4191
    ('burk', '', '$', '(burk|berk)'),
4192
    ('bUrg', '', '$', '(burk|berk)'),
4193
    ('burg', '', '$', '(burk|berk)'),
4194
    # CONSONANTS {z & Z; s & S} are approximately interchangeable
4195
    ('s', '', '[rmnl]', 'z'),
4196
    ('S', '', '[rmnl]', 'z'),
4197
    ('s', '[rmnl]', '', 'z'),
4198
    ('S', '[rmnl]', '', 'z'),
4199
    ('dS', '', '$', 'S'),
4200
    ('dZ', '', '$', 'S'),
4201
    ('Z', '', '$', 'S'),
4202
    ('S', '', '$', '(S|s)'),
4203
    ('z', '', '$', '(S|s)'),
4204
    ('S', '', '', 's'),
4205
    ('dZ', '', '', 'z'),
4206
    ('Z', '', '', 'z'),
4207
)
4208
4209
# ash/approxcyrillic.php
4210
# this file uses the same rules as approxrussian.php
4211
4212
# ash/approxenglish.php
4213
4214 1
_ASH_APPROX_ENGLISH = (
4215
    # VOWELS
4216
    ('I', '', '[^aEIeiou]e', '(Q|i|D)'),  # like in "five"
4217
    ('I', '', '$', 'i'),
4218
    ('I', '[aEIeiou]', '', 'i'),
4219
    ('I', '', '[^k]$', 'i'),
4220
    ('Ik', '[lr]', '$', '(ik|Qk)'),
4221
    ('Ik', '', '$', 'ik'),
4222
    ('sIts', '', '$', '(sits|sQts)'),
4223
    ('Its', '', '$', 'its'),
4224
    ('I', '', '', '(i|Q)'),
4225
    ('lE', '[bdfgkmnprsStvzZ]', '', '(il|li|lY)'),  # Applebaum < Appelbaum
4226
    ('au', '', '', '(D|a|u)'),
4227
    ('ou', '', '', '(D|o|u)'),
4228
    ('ai', '', '', '(D|a|i)'),
4229
    ('oi', '', '', '(D|o|i)'),
4230
    ('ui', '', '', '(D|u|i)'),
4231
    (
4232
        'E',
4233
        'D[^aeiEIou]',
4234
        '',
4235
        '(i|)',
4236
    ),  # Weinberg, Shaneberg (shaneberg/shejneberg) --> shejnberg  # noqa: E501
4237
    ('e', 'D[^aeiEIou]', '', '(i|)'),
4238
    ('e', '', '', 'i'),
4239
    ('E', '', '[fklmnprsStv]$', 'i'),
4240
    ('E', '', 'ts$', 'i'),
4241
    ('E', '[DaoiEuQY]', '', 'i'),
4242
    ('E', '', '[aoQY]', 'i'),
4243
    ('E', '', '', '(Y|i)'),
4244
    ('a', '', '', '(a|o)'),
4245
)
4246
4247
# ash/approxfrench.php
4248
# THE LINES BELOW WERE VALID FOR ASHKENAZIM
4249
4250 1
_ASH_APPROX_FRENCH = (
4251
    ('I', '', '$', 'i'),
4252
    ('I', '[aEIeiou]', '', 'i'),
4253
    ('I', '', '[^k]$', 'i'),
4254
    ('Ik', '[lr]', '$', '(ik|Qk)'),
4255
    ('Ik', '', '$', 'ik'),
4256
    ('sIts', '', '$', '(sits|sQts)'),
4257
    ('Its', '', '$', 'its'),
4258
    ('I', '', '', '(i|Q)'),
4259
    ('au', '', '', '(D|a|u)'),
4260
    ('ou', '', '', '(D|o|u)'),
4261
    ('ai', '', '', '(D|a|i)'),
4262
    ('oi', '', '', '(D|o|i)'),
4263
    ('ui', '', '', '(D|u|i)'),
4264
    ('a', '', '', '(a|o)'),
4265
    ('e', '', '', 'i'),
4266
    ('E', '', '[fklmnprsStv]$', 'i'),
4267
    ('E', '', 'ts$', 'i'),
4268
    ('E', '[aoiuQ]', '', 'i'),
4269
    ('E', '', '[aoQ]', 'i'),
4270
    ('E', '', '', '(Y|i)'),
4271
)
4272
4273
# ash/approxgerman.php
4274
4275 1
_ASH_APPROX_GERMAN = (
4276
    ('I', '', '$', 'i'),
4277
    ('I', '[aeiAEIOUouQY]', '', 'i'),
4278
    ('I', '', '[^k]$', 'i'),
4279
    ('Ik', '[lr]', '$', '(ik|Qk)'),
4280
    ('Ik', '', '$', 'ik'),
4281
    ('sIts', '', '$', '(sits|sQts)'),
4282
    ('Its', '', '$', 'its'),
4283
    ('I', '', '', '(Q|i)'),
4284
    ('AU', '', '', '(D|a|u)'),
4285
    ('aU', '', '', '(D|a|u)'),
4286
    ('Au', '', '', '(D|a|u)'),
4287
    ('au', '', '', '(D|a|u)'),
4288
    ('ou', '', '', '(D|o|u)'),
4289
    ('OU', '', '', '(D|o|u)'),
4290
    ('oU', '', '', '(D|o|u)'),
4291
    ('Ou', '', '', '(D|o|u)'),
4292
    ('ai', '', '', '(D|a|i)'),
4293
    ('Ai', '', '', '(D|a|i)'),
4294
    ('oi', '', '', '(D|o|i)'),
4295
    ('Oi', '', '', '(D|o|i)'),
4296
    ('ui', '', '', '(D|u|i)'),
4297
    ('Ui', '', '', '(D|u|i)'),
4298
    ('e', '', '', 'i'),
4299
    ('E', '', '[fklmnprst]$', 'i'),
4300
    ('E', '', 'ts$', 'i'),
4301
    ('E', '', '$', 'i'),
4302
    ('E', '[DaoAOUiuQY]', '', 'i'),
4303
    ('E', '', '[aoAOQY]', 'i'),
4304
    ('E', '', '', '(Y|i)'),
4305
    ('O', '', '$', 'o'),
4306
    ('O', '', '[fklmnprst]$', 'o'),
4307
    ('O', '', 'ts$', 'o'),
4308
    ('O', '[aoAOUeiuQY]', '', 'o'),
4309
    ('O', '', '', '(o|Y)'),
4310
    ('a', '', '', '(a|o)'),
4311
    ('A', '', '$', '(a|o)'),
4312
    ('A', '', '[fklmnprst]$', '(a|o)'),
4313
    ('A', '', 'ts$', '(a|o)'),
4314
    ('A', '[aoeOUiuQY]', '', '(a|o)'),
4315
    ('A', '', '', '(a|o|Y)'),
4316
    ('U', '', '$', 'u'),
4317
    ('U', '[DaoiuUQY]', '', 'u'),
4318
    ('U', '', '[^k]$', 'u'),
4319
    ('Uk', '[lr]', '$', '(uk|Qk)'),
4320
    ('Uk', '', '$', 'uk'),
4321
    ('sUts', '', '$', '(suts|sQts)'),
4322
    ('Uts', '', '$', 'uts'),
4323
    ('U', '', '', '(u|Q)'),
4324
)
4325
4326
# ash/approxhebrew.php
4327
4328 1
_ASH_APPROX_HEBREW = ()
4329
4330
# ash/approxhungarian.php
4331
4332
# this file uses the same rules as approxfrench.php
4333
4334
# ash/approxpolish.php
4335 1
_ASH_APPROX_POLISH = (
4336
    ('aiB', '', '[bp]', '(D|Dm)'),
4337
    ('oiB', '', '[bp]', '(D|Dm)'),
4338
    ('uiB', '', '[bp]', '(D|Dm)'),
4339
    ('eiB', '', '[bp]', '(D|Dm)'),
4340
    ('EiB', '', '[bp]', '(D|Dm)'),
4341
    ('iiB', '', '[bp]', '(D|Dm)'),
4342
    ('IiB', '', '[bp]', '(D|Dm)'),
4343
    ('aiB', '', '[dgkstvz]', '(D|Dn)'),
4344
    ('oiB', '', '[dgkstvz]', '(D|Dn)'),
4345
    ('uiB', '', '[dgkstvz]', '(D|Dn)'),
4346
    ('eiB', '', '[dgkstvz]', '(D|Dn)'),
4347
    ('EiB', '', '[dgkstvz]', '(D|Dn)'),
4348
    ('iiB', '', '[dgkstvz]', '(D|Dn)'),
4349
    ('IiB', '', '[dgkstvz]', '(D|Dn)'),
4350
    ('B', '', '[bp]', '(o|om|im)'),
4351
    ('B', '', '[dgkstvz]', '(o|on|in)'),
4352
    ('B', '', '', 'o'),
4353
    ('aiF', '', '[bp]', '(D|Dm)'),
4354
    ('oiF', '', '[bp]', '(D|Dm)'),
4355
    ('uiF', '', '[bp]', '(D|Dm)'),
4356
    ('eiF', '', '[bp]', '(D|Dm)'),
4357
    ('EiF', '', '[bp]', '(D|Dm)'),
4358
    ('iiF', '', '[bp]', '(D|Dm)'),
4359
    ('IiF', '', '[bp]', '(D|Dm)'),
4360
    ('aiF', '', '[dgkstvz]', '(D|Dn)'),
4361
    ('oiF', '', '[dgkstvz]', '(D|Dn)'),
4362
    ('uiF', '', '[dgkstvz]', '(D|Dn)'),
4363
    ('eiF', '', '[dgkstvz]', '(D|Dn)'),
4364
    ('EiF', '', '[dgkstvz]', '(D|Dn)'),
4365
    ('iiF', '', '[dgkstvz]', '(D|Dn)'),
4366
    ('IiF', '', '[dgkstvz]', '(D|Dn)'),
4367
    ('F', '', '[bp]', '(i|im|om)'),
4368
    ('F', '', '[dgkstvz]', '(i|in|on)'),
4369
    ('F', '', '', 'i'),
4370
    ('P', '', '', '(o|u)'),
4371
    ('I', '', '$', 'i'),
4372
    ('I', '', '[^k]$', 'i'),
4373
    ('Ik', '[lr]', '$', '(ik|Qk)'),
4374
    ('Ik', '', '$', 'ik'),
4375
    ('sIts', '', '$', '(sits|sQts)'),
4376
    ('Its', '', '$', 'its'),
4377
    ('I', '[aeiAEBFIou]', '', 'i'),
4378
    ('I', '', '', '(i|Q)'),
4379
    ('au', '', '', '(D|a|u)'),
4380
    ('ou', '', '', '(D|o|u)'),
4381
    ('ai', '', '', '(D|a|i)'),
4382
    ('oi', '', '', '(D|o|i)'),
4383
    ('ui', '', '', '(D|u|i)'),
4384
    ('a', '', '', '(a|o)'),
4385
    ('e', '', '', 'i'),
4386
    ('E', '', '[fklmnprst]$', 'i'),
4387
    ('E', '', 'ts$', 'i'),
4388
    ('E', '', '$', 'i'),
4389
    ('E', '[DaoiuQ]', '', 'i'),
4390
    ('E', '', '[aoQ]', 'i'),
4391
    ('E', '', '', '(Y|i)'),
4392
)
4393
4394
# ash/approxromanian.php
4395
4396
# this file uses the same rules as approxpolish.php
4397
4398
# ash/approxrussian.php
4399
4400 1
_ASH_APPROX_RUSSIAN = (
4401
    # VOWELS
4402
    ('I', '', '$', 'i'),
4403
    ('I', '', '[^k]$', 'i'),
4404
    ('Ik', '[lr]', '$', '(ik|Qk)'),
4405
    ('Ik', '', '$', 'ik'),
4406
    ('sIts', '', '$', '(sits|sQts)'),
4407
    ('Its', '', '$', 'its'),
4408
    ('I', '[aeiEIou]', '', 'i'),
4409
    ('I', '', '', '(i|Q)'),
4410
    ('au', '', '', '(D|a|u)'),
4411
    ('ou', '', '', '(D|o|u)'),
4412
    ('ai', '', '', '(D|a|i)'),
4413
    ('oi', '', '', '(D|o|i)'),
4414
    ('ui', '', '', '(D|u|i)'),
4415
    ('om', '', '[bp]', '(om|im)'),
4416
    ('on', '', '[dgkstvz]', '(on|in)'),
4417
    ('em', '', '[bp]', '(im|om)'),
4418
    ('en', '', '[dgkstvz]', '(in|on)'),
4419
    ('Em', '', '[bp]', '(im|Ym|om)'),
4420
    ('En', '', '[dgkstvz]', '(in|Yn|on)'),
4421
    ('a', '', '', '(a|o)'),
4422
    ('e', '', '', 'i'),
4423
    ('E', '', '[fklmnprsStv]$', 'i'),
4424
    ('E', '', 'ts$', 'i'),
4425
    ('E', '[DaoiuQ]', '', 'i'),
4426
    ('E', '', '[aoQ]', 'i'),
4427
    ('E', '', '', '(Y|i)'),
4428
)
4429
4430
# ash/approxspanish.php
4431
4432
# this file uses the same rules as approxfrench.php
4433
4434
# ash/exactany.php
4435
# These rules are applied after the word has been transliterated into the
# phonetic alphabet
4436
# These rules are substitution rules within the phonetic character space
# rather than mapping rules
4437
4438
# format of each entry rule in the table
4439
# (pattern, left context, right context, phonetic)
4440
# where
4441
# pattern is a sequence of characters that might appear after a word has
# been transliterated into phonetic alphabet
4442
# left context is the context that precedes the pattern
4443
# right context is the context that follows the pattern
4444
# phonetic is the result that this rule generates
4445
#
4446
# note that both left context and right context can be regular expressions
4447
# ex: left context of ^ would mean start of word
4448
# right context of $ means end of word
4449
#
4450
# match occurs if all of the following are true:
4451
# portion of word matches the pattern
4452
# that portion satisfies the context
4453
4454
# A, E, I, O, P, U should create variants, but a, e, i, o, u should not
# create any new variant
4455
# Q = ü ; Y = ä = ö
4456
4457 1
_ASH_EXACT_ANY = (
4458
    ('A', '', '', 'a'),
4459
    ('B', '', '', 'a'),
4460
    ('E', '', '', 'e'),
4461
    ('F', '', '', 'e'),
4462
    ('I', '', '', 'i'),
4463
    ('O', '', '', 'o'),
4464
    ('P', '', '', 'o'),
4465
    ('U', '', '', 'u'),
4466
    ('J', '', '', 'l'),
4467
)
4468
4469
# ash/exactapproxcommon.php
4470
# Ashkenazic
4471 1
_ASH_EXACT_APPROX_COMMON = (
4472
    ('h', '', '$', ''),
4473
    # VOICED - UNVOICED CONSONANTS
4474
    ('b', '', '[fktSs]', 'p'),
4475
    ('b', '', 'p', ''),
4476
    ('b', '', '$', 'p'),
4477
    ('p', '', '[gdZz]', 'b'),
4478
    ('p', '', 'b', ''),
4479
    ('v', '', '[pktSs]', 'f'),
4480
    ('v', '', 'f', ''),
4481
    ('v', '', '$', 'f'),
4482
    ('f', '', '[bgdZz]', 'v'),
4483
    ('f', '', 'v', ''),
4484
    ('g', '', '[pftSs]', 'k'),
4485
    ('g', '', 'k', ''),
4486
    ('g', '', '$', 'k'),
4487
    ('k', '', '[bdZz]', 'g'),
4488
    ('k', '', 'g', ''),
4489
    ('d', '', '[pfkSs]', 't'),
4490
    ('d', '', 't', ''),
4491
    ('d', '', '$', 't'),
4492
    ('t', '', '[bgZz]', 'd'),
4493
    ('t', '', 'd', ''),
4494
    ('s', '', 'dZ', ''),
4495
    ('s', '', 'tS', ''),
4496
    ('z', '', '[pfkSt]', 's'),
4497
    ('z', '', '[sSzZ]', ''),
4498
    ('s', '', '[sSzZ]', ''),
4499
    ('Z', '', '[sSzZ]', ''),
4500
    ('S', '', '[sSzZ]', ''),
4501
    # SIMPLIFICATION OF CONSONANT CLUSTERS
4502
    ('jnm', '', '', 'jm'),
4503
    # DOUBLE --> SINGLE
4504
    ('ji', '^', '', 'i'),
4505
    ('jI', '^', '', 'I'),
4506
    ('a', '', '[aAB]', ''),
4507
    ('a', '[AB]', '', ''),
4508
    ('A', '', 'A', ''),
4509
    ('B', '', 'B', ''),
4510
    ('b', '', 'b', ''),
4511
    ('d', '', 'd', ''),
4512
    ('f', '', 'f', ''),
4513
    ('g', '', 'g', ''),
4514
    ('k', '', 'k', ''),
4515
    ('l', '', 'l', ''),
4516
    ('m', '', 'm', ''),
4517
    ('n', '', 'n', ''),
4518
    ('p', '', 'p', ''),
4519
    ('r', '', 'r', ''),
4520
    ('t', '', 't', ''),
4521
    ('v', '', 'v', ''),
4522
    ('z', '', 'z', '')
4523
    # do not put name of file here since it always gets merged into another file  # noqa: E501
4524
)
4525
4526
# ash/exactcommon.php
4527
# Ashkenazic
4528
4529 1
_ASH_EXACT_COMMON = (
4530
    ('H', '', '', 'h'),
4531
    # VOICED - UNVOICED CONSONANTS
4532
    ('s', '[^t]', '[bgZd]', 'z'),
4533
    ('Z', '', '[pfkst]', 'S'),
4534
    ('Z', '', '$', 'S'),
4535
    ('S', '', '[bgzd]', 'Z'),
4536
    ('z', '', '$', 's'),
4537
    ('ji', '[aAoOeEiIuU]', '', 'j'),
4538
    ('jI', '[aAoOeEiIuU]', '', 'j'),
4539
    ('je', '[aAoOeEiIuU]', '', 'j'),
4540
    ('jE', '[aAoOeEiIuU]', '', 'j'),
4541
)
4542
4543
# ash/exactcyrillic.php
4544
# this file uses the same rules as exactrussian.php
4545
4546
# ash/exactenglish.php
4547
# this file uses the same rules as exactrussian.php
4548
4549
# ash/exactfrench.php
4550
# For Ashkenazic searches:
4551
# this file uses the same rules as exactrussian.php
4552
4553
# ash/exactgerman.php
4554
# this file uses the same rules as exactany.php
4555
4556
# ash/exacthebrew.php
4557 1
_ASH_EXACT_HEBREW = ()
4558
4559
# ash/exacthungarian.php
4560
# this file uses the same rules as exactrussian.php
4561
4562
# ash/exactpolish.php
4563 1
_ASH_EXACT_POLISH = (
4564
    ('B', '', '', 'a'),
4565
    ('F', '', '', 'e'),
4566
    ('P', '', '', 'o'),
4567
    ('E', '', '', 'e'),
4568
    ('I', '', '', 'i'),
4569
)
4570
4571
# ash/exactromanian.php
4572
# this file uses the same rules as exactrussian.php
4573
4574
# ash/exactrussian.php
4575 1
_ASH_EXACT_RUSSIAN = (('E', '', '', 'e'), ('I', '', '', 'i'))
4576
4577
# ash/exactspanish.php
4578
# this Ashkenazic file uses the same rules as exactrussian.php
4579
4580
# ash/hebrewcommon.php
4581
# Ashkenazic
4582
4583 1
_ASH_HEBREW_COMMON = (
4584
    ('ts', '', '', 'C'),  # for not confusion Gutes [=guts] and Guts [=guc]
4585
    ('tS', '', '', 'C'),  # same reason
4586
    ('S', '', '', 's'),
4587
    ('p', '', '', 'f'),
4588
    ('b', '^', '', 'b'),
4589
    ('b', '', '', '(b|v)'),
4590
    ('J', '', '', 'l'),
4591
    ('ja', '', '', 'i'),
4592
    ('jA', '', '', 'i'),
4593
    ('jB', '', '', 'i'),
4594
    ('je', '', '', 'i'),
4595
    ('jE', '', '', 'i'),
4596
    ('jF', '', '', 'i'),
4597
    ('aj', '', '', 'i'),
4598
    ('Aj', '', '', 'i'),
4599
    ('Bj', '', '', 'i'),
4600
    ('Fj', '', '', 'i'),
4601
    ('I', '', '', 'i'),
4602
    ('Q', '', '', 'i'),
4603
    ('j', '', '', 'i'),
4604
    ('a', '^', '', '1'),
4605
    ('A', '^', '', '1'),
4606
    ('B', '^', '', '1'),
4607
    ('e', '^', '', '1'),
4608
    ('E', '^', '', '1'),
4609
    ('F', '^', '', '1'),
4610
    ('Y', '^', '', '1'),
4611
    ('a', '', '$', '1'),
4612
    ('A', '', '$', '1'),
4613
    ('B', '', '$', '1'),
4614
    ('e', '', '$', '1'),
4615
    ('E', '', '$', '1'),
4616
    ('F', '', '$', '1'),
4617
    ('Y', '', '$', '1'),
4618
    ('a', '', '', ''),
4619
    ('A', '', '', ''),
4620
    ('B', '', '', ''),
4621
    ('e', '', '', ''),
4622
    ('E', '', '', ''),
4623
    ('F', '', '', ''),
4624
    ('Y', '', '', ''),
4625
    ('oj', '^', '', '(u|vi)'),
4626
    ('Oj', '^', '', '(u|vi)'),
4627
    ('uj', '^', '', '(u|vi)'),
4628
    ('Uj', '^', '', '(u|vi)'),
4629
    ('oj', '', '', 'u'),
4630
    ('Oj', '', '', 'u'),
4631
    ('uj', '', '', 'u'),
4632
    ('Uj', '', '', 'u'),
4633
    ('ou', '^', '', '(u|v|1)'),
4634
    ('o', '^', '', '(u|v|1)'),
4635
    ('O', '^', '', '(u|v|1)'),
4636
    ('P', '^', '', '(u|v|1)'),
4637
    ('U', '^', '', '(u|v|1)'),
4638
    ('u', '^', '', '(u|v|1)'),
4639
    ('o', '', '$', '(u|1)'),
4640
    ('O', '', '$', '(u|1)'),
4641
    ('P', '', '$', '(u|1)'),
4642
    ('u', '', '$', '(u|1)'),
4643
    ('U', '', '$', '(u|1)'),
4644
    ('ou', '', '', 'u'),
4645
    ('o', '', '', 'u'),
4646
    ('O', '', '', 'u'),
4647
    ('P', '', '', 'u'),
4648
    ('U', '', '', 'u'),
4649
    ('VV', '', '', 'u'),  # alef/ayin + vov from ruleshebrew
4650
    ('V', '', '', 'v'),  # tsvey-vov from ruleshebrew;; only Ashkenazic
4651
    ('L', '^', '', '1'),  # alef/ayin from ruleshebrew
4652
    ('L', '', '$', '1'),  # alef/ayin from ruleshebrew
4653
    ('L', '', '', ''),  # alef/ayin from ruleshebrew
4654
    ('WW', '^', '', '(vi|u)'),  # vav-yod from ruleshebrew
4655
    ('WW', '', '', 'u'),  # vav-yod from ruleshebrew
4656
    ('W', '^', '', '(u|v)'),  # vav from ruleshebrew
4657
    ('W', '', '', 'u'),  # vav from ruleshebrew
4658
    # ("g","","","(g|Z)"),
4659
    # ("z","","","(z|Z)"),
4660
    # ("d","","","(d|dZ)"),
4661
    ('TB', '^', '', 't'),  # tav from ruleshebrew; only Ashkenazic
4662
    ('TB', '', '$', 's'),  # tav from ruleshebrew; only Ashkenazic
4663
    ('TB', '', '', '(t|s)'),  # tav from ruleshebrew; only Ashkenazic
4664
    ('T', '', '', 't'),  # tet from ruleshebrew
4665
    # ("k","","","(k|x)"),
4666
    # ("x","","","(k|x)"),
4667
    ('K', '', '', 'k'),  # kof and initial kaf from ruleshebrew
4668
    ('X', '', '', 'x'),  # khet and final kaf from ruleshebrew
4669
    ('H', '^', '', '(x|1)'),
4670
    ('H', '', '$', '(x|1)'),
4671
    ('H', '', '', '(x|)'),
4672
    ('h', '^', '', '1'),
4673
    ('h', '', '', ''),
4674
)
4675
4676
# ash/lang.php
4677
# ASHKENAZIC
4678
4679
# format of entries in $languageRules table is
#   (pattern, language, acceptance)
# where
#   pattern is a regular expression
#     e.g., ^ means start of word, $ means end of word,
#     [^ei] means anything but e or i, etc.
#   language is one or more of the languages defined above separated by
#     + signs
#   acceptance is true or false
# meaning is:
#   if "pattern" matches and acceptance is true, name is in one of the
#     languages indicated and no others
#   if "pattern" matches and acceptance is false, name is not in any of the
#     languages indicated
4689
4690 1
_ASH_LANGUAGE_RULES = (
4691
    # 1. following are rules to accept the language
4692
    # 1.1 Special letter combinations
4693
    ('zh', 147616, True),
4694
    ('eau', 64, True),
4695
    ('[aoeiuäöü]h', 128, True),
4696
    ('^vogel', 128, True),
4697
    ('vogel$', 128, True),
4698
    ('witz', 128, True),
4699
    ('tz$', 131232, True),
4700
    ('^tz', 131104, True),
4701
    ('güe', 262144, True),
4702
    ('güi', 262144, True),
4703
    ('ghe', 65536, True),
4704
    ('ghi', 65536, True),
4705
    ('vici$', 65536, True),
4706
    ('schi$', 65536, True),
4707
    ('chsch', 128, True),
4708
    ('tsch', 128, True),
4709
    ('ssch', 128, True),
4710
    ('sch$', 131200, True),
4711
    ('^sch', 131200, True),
4712
    ('^rz', 16384, True),
4713
    ('rz$', 16512, True),
4714
    ('[^aoeiuäöü]rz', 16384, True),
4715
    ('rz[^aoeiuäöü]', 16384, True),
4716
    ('cki$', 16384, True),
4717
    ('ska$', 16384, True),
4718
    ('cka$', 16384, True),
4719
    ('ue', 131200, True),
4720
    ('ae', 131232, True),
4721
    ('oe', 131296, True),
4722
    ('th$', 128, True),
4723
    ('^th', 128, True),
4724
    ('th[^aoeiu]', 128, True),
4725
    ('mann', 128, True),
4726
    ('cz', 16384, True),
4727
    ('cy', 16384, True),
4728
    ('niew', 16384, True),
4729
    ('stein', 128, True),
4730
    ('heim$', 128, True),
4731
    ('heimer$', 128, True),
4732
    ('ii$', 131072, True),
4733
    ('iy$', 131072, True),
4734
    ('yy$', 131072, True),
4735
    ('yi$', 131072, True),
4736
    ('yj$', 131072, True),
4737
    ('ij$', 131072, True),
4738
    ('gaus$', 131072, True),
4739
    ('gauz$', 131072, True),
4740
    ('gauz$', 131072, True),
4741
    ('goltz$', 131072, True),
4742
    ('gol\'tz$', 131072, True),
4743
    ('golts$', 131072, True),
4744
    ('gol\'ts$', 131072, True),
4745
    ('^goltz', 131072, True),
4746
    ('^gol\'tz', 131072, True),
4747
    ('^golts', 131072, True),
4748
    ('^gol\'ts', 131072, True),
4749
    ('gendler$', 131072, True),
4750
    ('gejmer$', 131072, True),
4751
    ('gejm$', 131072, True),
4752
    ('geimer$', 131072, True),
4753
    ('geim$', 131072, True),
4754
    ('geymer', 131072, True),
4755
    ('geym$', 131072, True),
4756
    ('gof$', 131072, True),
4757
    ('thal', 128, True),
4758
    ('zweig', 128, True),
4759
    ('ck$', 160, True),
4760
    ('c$', 83968, True),
4761
    ('sz', 18432, True),
4762
    ('gue', 262208, True),
4763
    ('gui', 262208, True),
4764
    ('guy', 64, True),
4765
    ('cs$', 2048, True),
4766
    ('^cs', 2048, True),
4767
    ('dzs', 2048, True),
4768
    ('zs$', 2048, True),
4769
    ('^zs', 2048, True),
4770
    ('^wl', 16384, True),
4771
    ('^wr', 16544, True),
4772
    ('gy$', 2048, True),
4773
    ('gy[aeou]', 2048, True),
4774
    ('gy', 133120, True),
4775
    ('ly', 149504, True),
4776
    ('ny', 149504, True),
4777
    ('ty', 149504, True),
4778
    # 1.2 special characters
4779
    ('â', 65600, True),
4780
    ('ă', 65536, True),
4781
    ('à', 64, True),
4782
    ('ä', 128, True),
4783
    ('á', 264192, True),
4784
    ('ą', 16384, True),
4785
    ('ć', 16384, True),
4786
    ('ç', 64, True),
4787
    ('ę', 16384, True),
4788
    ('é', 264256, True),
4789
    ('è', 64, True),
4790
    ('ê', 64, True),
4791
    ('í', 264192, True),
4792
    ('î', 65600, True),
4793
    ('ł', 16384, True),
4794
    ('ń', 16384, True),
4795
    ('ñ', 262144, True),
4796
    ('ó', 280576, True),
4797
    ('ö', 2176, True),
4798
    ('õ', 2048, True),
4799
    ('ş', 65536, True),
4800
    ('ś', 16384, True),
4801
    ('ţ', 65536, True),
4802
    ('ü', 2176, True),
4803
    ('ù', 64, True),
4804
    ('ű', 2048, True),
4805
    ('ú', 264192, True),
4806
    ('ź', 16384, True),
4807
    ('ż', 16384, True),
4808
    ('ß', 128, True),
4809
    # Every Cyrillic word has at least one Cyrillic vowel (аёеоиуыэюя)
4810
    ('а', 4, True),
4811
    ('ё', 4, True),
4812
    ('о', 4, True),
4813
    ('е', 4, True),
4814
    ('и', 4, True),
4815
    ('у', 4, True),
4816
    ('ы', 4, True),
4817
    ('э', 4, True),
4818
    ('ю', 4, True),
4819
    ('я', 4, True),
4820
    # Hebrew
4821
    ('א', 1024, True),
4822
    ('ב', 1024, True),
4823
    ('ג', 1024, True),
4824
    ('ד', 1024, True),
4825
    ('ה', 1024, True),
4826
    ('ו', 1024, True),
4827
    ('ז', 1024, True),
4828
    ('ח', 1024, True),
4829
    ('ט', 1024, True),
4830
    ('י', 1024, True),
4831
    ('כ', 1024, True),
4832
    ('ל', 1024, True),
4833
    ('מ', 1024, True),
4834
    ('נ', 1024, True),
4835
    ('ס', 1024, True),
4836
    ('ע', 1024, True),
4837
    ('פ', 1024, True),
4838
    ('צ', 1024, True),
4839
    ('ק', 1024, True),
4840
    ('ר', 1024, True),
4841
    ('ש', 1024, True),
4842
    ('ת', 1024, True),
4843
    # 2. following are rules to reject the language
4844
    # Every Latin character word has at least one Latin vowel
4845
    ('a', 1028, False),
4846
    ('o', 1028, False),
4847
    ('e', 1028, False),
4848
    ('i', 1028, False),
4849
    ('y', 66564, False),
4850
    ('u', 1028, False),
4851
    (
4852
        'v[^aoeiuäüö]',
4853
        128,
4854
        False,
4855
    ),  # in german, "v" can be found before a vowel only  # noqa: E501
4856
    (
4857
        'y[^aoeiu]',
4858
        128,
4859
        False,
4860
    ),  # in german, "y" usually appears only in the last position; sometimes before a vowel  # noqa: E501
0 ignored issues
show
Coding Style introduced by
This line is too long as per the coding-style (106/100).

This check looks for lines that are too long. You can specify the maximum line length.

Loading history...
4861
    ('c[^aohk]', 128, False),
4862
    ('dzi', 224, False),
4863
    ('ou', 128, False),
4864
    ('aj', 224, False),
4865
    ('ej', 224, False),
4866
    ('oj', 224, False),
4867
    ('uj', 224, False),
4868
    ('k', 65536, False),
4869
    ('v', 16384, False),
4870
    ('ky', 16384, False),
4871
    ('eu', 147456, False),
4872
    ('w', 460864, False),
4873
    ('kie', 262208, False),
4874
    ('gie', 327744, False),
4875
    ('q', 215040, False),
4876
    ('sch', 280640, False),
4877
    ('^h', 131072, False),
4878
)
4879
4880
# ash/languagenames.php
4881 1
_ASH_LANGUAGES = (
4882
    'any',
4883
    'cyrillic',
4884
    'english',
4885
    'french',
4886
    'german',
4887
    'hebrew',
4888
    'hungarian',
4889
    'polish',
4890
    'romanian',
4891
    'russian',
4892
    'spanish',
4893
)
4894
4895
# ash/rulesany.php
4896
# ASHKENAZIC
4897 1
_ASH_RULES_ANY = (
4898
    # CONVERTING FEMININE TO MASCULINE
4899
    ('yna', '', '$', '(in[131072]|ina)'),
4900
    ('ina', '', '$', '(in[131072]|ina)'),
4901
    ('liova', '', '$', '(lof[131072]|lef[131072]|lova)'),
4902
    ('lova', '', '$', '(lof[131072]|lef[131072]|lova)'),
4903
    ('ova', '', '$', '(of[131072]|ova)'),
4904
    ('eva', '', '$', '(ef[131072]|eva)'),
4905
    ('aia', '', '$', '(aja|i[131072])'),
4906
    ('aja', '', '$', '(aja|i[131072])'),
4907
    ('aya', '', '$', '(aja|i[131072])'),
4908
    ('lowa', '', '$', '(lova|lof[16384]|l[16384]|el[16384])'),
4909
    ('kowa', '', '$', '(kova|kof[16384]|k[16384]|ek[16384])'),
4910
    ('owa', '', '$', '(ova|of[16384]|)'),
4911
    ('lowna', '', '$', '(lovna|levna|l[16384]|el[16384])'),
4912
    ('kowna', '', '$', '(kovna|k[16384]|ek[16384])'),
4913
    ('owna', '', '$', '(ovna|[16384])'),
4914
    ('lówna', '', '$', '(l|el[16384])'),  # polish
4915
    ('kówna', '', '$', '(k|ek[16384])'),  # polish
4916
    ('ówna', '', '$', ''),  # polish
4917
    ('a', '', '$', '(a|i[16384])'),
4918
    # CONSONANTS  (integrated: German, Polish, Russian, Romanian and English)
4919
    ('rh', '^', '', 'r'),
4920
    ('ssch', '', '', 'S'),
4921
    ('chsch', '', '', 'xS'),
4922
    ('tsch', '', '', 'tS'),
4923
    ('sch', '', '[ei]', '(sk[65536]|S|StS[131072])'),  # german
4924
    ('sch', '', '', '(S|StS[131072])'),  # german
4925
    ('ssh', '', '', 'S'),
4926
    ('sh', '', '[äöü]', 'sh'),  # german
4927
    ('sh', '', '[aeiou]', '(S[131104]|sh)'),
4928
    ('sh', '', '', 'S'),  # russian+english
4929
    ('kh', '', '', '(x[131104]|kh)'),
4930
    ('chs', '', '', '(ks[128]|xs|tSs[131104])'),
4931
    # French "ch" is currently disabled
4932
    # ("ch","","[ei]","(x|tS|k[65536]|S[64])"),
4933
    # ("ch","","","(x|tS[131104]|S[64])"),
4934
    ('ch', '', '[ei]', '(x|k[65536]|tS[131104])'),
4935
    ('ch', '', '', '(x|tS[131104])'),
4936
    ('ck', '', '', '(k|tsk[16384])'),
4937
    ('czy', '', '', 'tSi'),
4938
    ('cze', '', '[bcdgkpstwzż]', '(tSe|tSF)'),
4939
    ('ciewicz', '', '', '(tsevitS|tSevitS)'),
4940
    ('siewicz', '', '', '(sevitS|SevitS)'),
4941
    ('ziewicz', '', '', '(zevitS|ZevitS)'),
4942
    ('riewicz', '', '', 'rjevitS'),
4943
    ('diewicz', '', '', 'djevitS'),
4944
    ('tiewicz', '', '', 'tjevitS'),
4945
    ('iewicz', '', '', 'evitS'),
4946
    ('ewicz', '', '', 'evitS'),
4947
    ('owicz', '', '', 'ovitS'),
4948
    ('icz', '', '', 'itS'),
4949
    ('cz', '', '', 'tS'),  # Polish
4950
    ('cia', '', '[bcdgkpstwzż]', '(tSB[16384]|tsB)'),
4951
    ('cia', '', '', '(tSa[16384]|tsa)'),
4952
    ('cią', '', '[bp]', '(tSom[16384]|tsom)'),
4953
    ('cią', '', '', '(tSon[16384]|tson)'),
4954
    ('cię', '', '[bp]', '(tSem[16384]|tsem)'),
4955
    ('cię', '', '', '(tSen[16384]|tsen)'),
4956
    ('cie', '', '[bcdgkpstwzż]', '(tSF[16384]|tsF)'),
4957
    ('cie', '', '', '(tSe[16384]|tse)'),
4958
    ('cio', '', '', '(tSo[16384]|tso)'),
4959
    ('ciu', '', '', '(tSu[16384]|tsu)'),
4960
    ('ci', '', '$', '(tsi[16384]|tSi[81920]|tS[65536]|si)'),
4961
    ('ci', '', '', '(tsi[16384]|tSi[81920]|si)'),
4962
    ('ce', '', '[bcdgkpstwzż]', '(tsF[16384]|tSe[81920]|se)'),
4963
    ('ce', '', '', '(tSe[81920]|tse[16384]|se)'),
4964
    ('cy', '', '', '(si|tsi[16384])'),
4965
    ('ssz', '', '', 'S'),  # Polish
4966
    (
4967
        'sz',
4968
        '',
4969
        '',
4970
        'S',
4971
    ),  # Polish; actually could also be Hungarian /s/, disabled here  # noqa: E501
4972
    ('ssp', '', '', '(Sp[128]|sp)'),
4973
    ('sp', '', '', '(Sp[128]|sp)'),
4974
    ('sst', '', '', '(St[128]|st)'),
4975
    ('st', '', '', '(St[128]|st)'),
4976
    ('ss', '', '', 's'),
4977
    ('sia', '', '[bcdgkpstwzż]', '(SB[16384]|sB[16384]|sja)'),
4978
    ('sia', '', '', '(Sa[16384]|sja)'),
4979
    ('sią', '', '[bp]', '(Som[16384]|som)'),
4980
    ('sią', '', '', '(Son[16384]|son)'),
4981
    ('się', '', '[bp]', '(Sem[16384]|sem)'),
4982
    ('się', '', '', '(Sen[16384]|sen)'),
4983
    ('sie', '', '[bcdgkpstwzż]', '(SF[16384]|sF|zi[128])'),
4984
    ('sie', '', '', '(se|Se[16384]|zi[128])'),
4985
    ('sio', '', '', '(So[16384]|so)'),
4986
    ('siu', '', '', '(Su[16384]|sju)'),
4987
    ('si', '', '', '(Si[16384]|si|zi[128])'),
4988
    ('s', '', '[aeiouäöü]', '(s|z[128])'),
4989
    ('gue', '', '', 'ge'),
4990
    ('gui', '', '', 'gi'),
4991
    ('guy', '', '', 'gi'),
4992
    ('gh', '', '[ei]', '(g[65536]|gh)'),
4993
    ('gauz', '', '$', 'haus'),
4994
    ('gaus', '', '$', 'haus'),
4995
    ('gol\'ts', '', '$', 'holts'),
4996
    ('golts', '', '$', 'holts'),
4997
    ('gol\'tz', '', '$', 'holts'),
4998
    ('goltz', '', '', 'holts'),
4999
    ('gol\'ts', '^', '', 'holts'),
5000
    ('golts', '^', '', 'holts'),
5001
    ('gol\'tz', '^', '', 'holts'),
5002
    ('goltz', '^', '', 'holts'),
5003
    ('gendler', '', '$', 'hendler'),
5004
    ('gejmer', '', '$', 'hajmer'),
5005
    ('gejm', '', '$', 'hajm'),
5006
    ('geymer', '', '$', 'hajmer'),
5007
    ('geym', '', '$', 'hajm'),
5008
    ('geimer', '', '$', 'hajmer'),
5009
    ('geim', '', '$', 'hajm'),
5010
    ('gof', '', '$', 'hof'),
5011
    ('ger', '', '$', 'ger'),
5012
    ('gen', '', '$', 'gen'),
5013
    ('gin', '', '$', 'gin'),
5014
    ('gie', '', '$', '(ge|gi[128]|ji[64])'),
5015
    ('gie', '', '', 'ge'),
5016
    ('ge', '[yaeiou]', '', '(gE|xe[262144]|dZe[65568])'),
5017
    ('gi', '[yaeiou]', '', '(gI|xi[262144]|dZi[65568])'),
5018
    ('ge', '', '', '(gE|dZe[65568]|hE[131072]|xe[262144])'),
5019
    ('gi', '', '', '(gI|dZi[65568]|hI[131072]|xi[262144])'),
5020
    ('gy', '', '[aeouáéóúüöőű]', '(gi|dj[2048])'),
5021
    ('gy', '', '', '(gi|d[2048])'),
5022
    ('g', '[jyaeiou]', '[aouyei]', 'g'),
5023
    ('g', '', '[aouei]', '(g|h[131072])'),
5024
    ('ej', '', '', '(aj|eZ[65600]|ex[262144])'),
5025
    ('ej', '', '', 'aj'),
5026
    ('ly', '', '[au]', 'l'),
5027
    ('li', '', '[au]', 'l'),
5028
    ('lj', '', '[au]', 'l'),
5029
    ('lio', '', '', '(lo|le[131072])'),
5030
    ('lyo', '', '', '(lo|le[131072])'),
5031
    ('ll', '', '', '(l|J[262144])'),
5032
    ('j', '', '[aoeiuy]', '(j|dZ[32]|x[262144]|Z[65600])'),
5033
    ('j', '', '', '(j|x[262144])'),
5034
    ('pf', '', '', '(pf|p|f)'),
5035
    ('ph', '', '', '(ph|f)'),
5036
    ('qu', '', '', '(kv[128]|k)'),
5037
    ('rze', 't', '', '(Se[16384]|re)'),  # polish
5038
    ('rze', '', '', '(rze|rtsE[128]|Ze[16384]|re[16384]|rZe[16384])'),
5039
    ('rzy', 't', '', '(Si[16384]|ri)'),  # polish
5040
    ('rzy', '', '', '(Zi[16384]|ri[16384]|rZi)'),
5041
    ('rz', 't', '', '(S[16384]|r)'),  # polish
5042
    ('rz', '', '', '(rz|rts[128]|Z[16384]|r[16384]|rZ[16384])'),  # polish
5043
    ('tz', '', '$', '(ts|tS[160])'),
5044
    ('tz', '^', '', '(ts|tS[160])'),
5045
    ('tz', '', '', '(ts[131232]|tz)'),
5046
    ('zh', '', '', '(Z|zh[16384]|tsh[128])'),
5047
    ('zia', '', '[bcdgkpstwzż]', '(ZB[16384]|zB[16384]|zja)'),
5048
    ('zia', '', '', '(Za[16384]|zja)'),
5049
    ('zią', '', '[bp]', '(Zom[16384]|zom)'),
5050
    ('zią', '', '', '(Zon[16384]|zon)'),
5051
    ('zię', '', '[bp]', '(Zem[16384]|zem)'),
5052
    ('zię', '', '', '(Zen[16384]|zen)'),
5053
    ('zie', '', '[bcdgkpstwzż]', '(ZF[16384]|zF[16384]|ze|tsi[128])'),
5054
    ('zie', '', '', '(ze|Ze[16384]|tsi[128])'),
5055
    ('zio', '', '', '(Zo[16384]|zo)'),
5056
    ('ziu', '', '', '(Zu[16384]|zju)'),
5057
    ('zi', '', '', '(Zi[16384]|zi|tsi[128])'),
5058
    ('thal', '', '$', 'tal'),
5059
    ('th', '^', '', 't'),
5060
    ('th', '', '[aeiou]', '(t[128]|th)'),
5061
    ('th', '', '', 't'),  # german
5062
    ('vogel', '', '', '(vogel|fogel[128])'),
5063
    ('v', '^', '', '(v|f[128])'),
5064
    ('h', '[aeiouyäöü]', '', ''),  # german
5065
    ('h', '', '', '(h|x[81920])'),
5066
    ('h', '^', '', '(h|H[160])'),  # H can be exact "h" or approximate "kh"
5067
    # VOWELS
5068
    ('yi', ' ', '', 'i'),
5069
    ('ii', '', ' ', 'i'),
5070
    ('iy', '', ' ', 'i'),
5071
    ('yy', '', ' ', 'i'),
5072
    # ("e","","$","(e|)"),  # French & English rule disabled except for final -ine  # noqa: E501
5073
    ('e', 'in', '$', '(e|[64])'),
5074
    ('yj', '', '$', 'i'),  # russian
5075
    ('ij', '', '$', 'i'),  # russian
5076
    ('aue', '', '', 'aue'),
5077
    ('oue', '', '', 'oue'),
5078
    ('au', '', '', '(au|o[64])'),
5079
    ('ou', '', '', '(ou|u[64])'),
5080
    ('ue', '', '', '(Q|uje[131072])'),
5081
    ('ae', '', '', '(Y[128]|aje[131072]|ae)'),
5082
    ('oe', '', '', '(Y[128]|oje[131072]|oe)'),
5083
    ('ee', '', '', '(i[32]|aje[131072]|e)'),
5084
    ('ei', '', '', 'aj'),
5085
    ('ey', '', '', 'aj'),
5086
    ('eu', '', '', '(aj[128]|oj[128]|eu)'),
5087
    ('i', '[aou]', '', 'j'),
5088
    ('y', '[aou]', '', 'j'),
5089
    ('ie', '', '[bcdgkpstwzż]', '(i[128]|e[16384]|ije[131072]|je)'),
5090
    ('ie', '', '', '(i[128]|e[16384]|ije[131072]|je)'),
5091
    ('ye', '', '', '(je|ije[131072])'),
5092
    ('i', '', '[au]', 'j'),
5093
    ('y', '', '[au]', 'j'),
5094
    ('io', '', '', '(jo|e[131072])'),
5095
    ('yo', '', '', '(jo|e[131072])'),
5096
    ('ea', '', '', '(ea|ja[65536])'),
5097
    ('e', '^', '', '(e|je[131072])'),
5098
    ('oo', '', '', '(u[32]|o)'),
5099
    ('uu', '', '', 'u'),
5100
    # LANGUAGE SPECIFIC CHARACTERS
5101
    ('ć', '', '', '(tS[16384]|ts)'),  # polish
5102
    ('ł', '', '', 'l'),  # polish
5103
    ('ń', '', '', 'n'),  # polish
5104
    ('ñ', '', '', '(n|nj[262144])'),
5105
    ('ś', '', '', '(S[16384]|s)'),  # polish
5106
    ('ş', '', '', 'S'),  # romanian
5107
    ('ţ', '', '', 'ts'),  # romanian
5108
    ('ż', '', '', 'Z'),  # polish
5109
    ('ź', '', '', '(Z[16384]|z)'),  # polish
5110
    ('où', '', '', 'u'),  # french
5111
    ('ą', '', '[bp]', 'om'),  # polish
5112
    ('ą', '', '', 'on'),  # polish
5113
    ('ä', '', '', '(Y|e)'),  # german
5114
    ('á', '', '', 'a'),  # hungarian
5115
    ('ă', '', '', '(e[65536]|a)'),  # romanian
5116
    ('à', '', '', 'a'),  # french
5117
    ('â', '', '', 'a'),  # french+romanian
5118
    ('é', '', '', 'e'),
5119
    ('è', '', '', 'e'),  # french
5120
    ('ê', '', '', 'e'),  # french
5121
    ('ę', '', '[bp]', 'em'),  # polish
5122
    ('ę', '', '', 'en'),  # polish
5123
    ('í', '', '', 'i'),
5124
    ('î', '', '', 'i'),
5125
    ('ö', '', '', 'Y'),
5126
    ('ő', '', '', 'Y'),  # hungarian
5127
    ('ó', '', '', '(u[16384]|o)'),
5128
    ('ű', '', '', 'Q'),
5129
    ('ü', '', '', 'Q'),
5130
    ('ú', '', '', 'u'),
5131
    ('ű', '', '', 'Q'),  # hungarian
5132
    ('ß', '', '', 's'),  # german
5133
    ('\'', '', '', ''),
5134
    ('"', '', '', ''),
5135
    ('a', '', '[bcdgkpstwzż]', '(A|B[16384])'),
5136
    ('e', '', '[bcdgkpstwzż]', '(E|F[16384])'),
5137
    ('o', '', '[bcćdgklłmnńrsśtwzźż]', '(O|P[16384])'),
5138
    # LATIN ALPHABET
5139
    ('a', '', '', 'A'),
5140
    ('b', '', '', 'b'),
5141
    ('c', '', '', '(k|ts[16384])'),
5142
    ('d', '', '', 'd'),
5143
    ('e', '', '', 'E'),
5144
    ('f', '', '', 'f'),
5145
    ('g', '', '', 'g'),
5146
    ('h', '', '', 'h'),
5147
    ('i', '', '', 'I'),
5148
    ('j', '', '', 'j'),
5149
    ('k', '', '', 'k'),
5150
    ('l', '', '', 'l'),
5151
    ('m', '', '', 'm'),
5152
    ('n', '', '', 'n'),
5153
    ('o', '', '', 'O'),
5154
    ('p', '', '', 'p'),
5155
    ('q', '', '', 'k'),
5156
    ('r', '', '', 'r'),
5157
    ('s', '', '', 's'),
5158
    ('t', '', '', 't'),
5159
    ('u', '', '', 'U'),
5160
    ('v', '', '', 'v'),
5161
    ('w', '', '', 'v'),  # English disabled
5162
    ('x', '', '', 'ks'),
5163
    ('y', '', '', 'i'),
5164
    ('z', '', '', '(ts[128]|z)'),
5165
)
5166
5167
# ash/rulescyrillic.php
5168
5169 1
_ASH_RULES_CYRILLIC = (
5170
    ('ця', '', '', 'tsa'),
5171
    ('цю', '', '', 'tsu'),
5172
    ('циа', '', '', 'tsa'),
5173
    ('цие', '', '', 'tse'),
5174
    ('цио', '', '', 'tso'),
5175
    ('циу', '', '', 'tsu'),
5176
    ('сие', '', '', 'se'),
5177
    ('сио', '', '', 'so'),
5178
    ('зие', '', '', 'ze'),
5179
    ('зио', '', '', 'zo'),
5180
    ('гауз', '', '$', 'haus'),
5181
    ('гаус', '', '$', 'haus'),
5182
    ('гольц', '', '$', 'holts'),
5183
    ('геймер', '', '$', 'hajmer'),
5184
    ('гейм', '', '$', 'hajm'),
5185
    ('гоф', '', '$', 'hof'),
5186
    ('гер', '', '$', 'ger'),
5187
    ('ген', '', '$', 'gen'),
5188
    ('гин', '', '$', 'gin'),
5189
    ('г', '(й|ё|я|ю|ы|а|е|о|и|у)', '(а|е|о|и|у)', 'g'),
5190
    ('г', '', '(а|е|о|и|у)', '(g|h)'),
5191
    ('ля', '', '', 'la'),
5192
    ('лю', '', '', 'lu'),
5193
    ('лё', '', '', '(le|lo)'),
5194
    ('лио', '', '', '(le|lo)'),
5195
    ('ле', '', '', '(lE|lo)'),
5196
    ('ийе', '', '', 'je'),
5197
    ('ие', '', '', 'je'),
5198
    ('ыйе', '', '', 'je'),
5199
    ('ые', '', '', 'je'),
5200
    ('ий', '', '(а|о|у)', 'j'),
5201
    ('ый', '', '(а|о|у)', 'j'),
5202
    ('ий', '', '$', 'i'),
5203
    ('ый', '', '$', 'i'),
5204
    ('ё', '', '', '(e|jo)'),
5205
    ('ей', '^', '', '(jaj|aj)'),
5206
    ('е', '(а|е|о|у)', '', 'je'),
5207
    ('е', '^', '', 'je'),
5208
    ('эй', '', '', 'aj'),
5209
    ('ей', '', '', 'aj'),
5210
    ('ауе', '', '', 'aue'),
5211
    ('ауэ', '', '', 'aue'),
5212
    ('а', '', '', 'a'),
5213
    ('б', '', '', 'b'),
5214
    ('в', '', '', 'v'),
5215
    ('г', '', '', 'g'),
5216
    ('д', '', '', 'd'),
5217
    ('е', '', '', 'E'),
5218
    ('ж', '', '', 'Z'),
5219
    ('з', '', '', 'z'),
5220
    ('и', '', '', 'I'),
5221
    ('й', '', '', 'j'),
5222
    ('к', '', '', 'k'),
5223
    ('л', '', '', 'l'),
5224
    ('м', '', '', 'm'),
5225
    ('н', '', '', 'n'),
5226
    ('о', '', '', 'o'),
5227
    ('п', '', '', 'p'),
5228
    ('р', '', '', 'r'),
5229
    ('с', '', 'с', ''),
5230
    ('с', '', '', 's'),
5231
    ('т', '', '', 't'),
5232
    ('у', '', '', 'u'),
5233
    ('ф', '', '', 'f'),
5234
    ('х', '', '', 'x'),
5235
    ('ц', '', '', 'ts'),
5236
    ('ч', '', '', 'tS'),
5237
    ('ш', '', '', 'S'),
5238
    ('щ', '', '', 'StS'),
5239
    ('ъ', '', '', ''),
5240
    ('ы', '', '', 'I'),
5241
    ('ь', '', '', ''),
5242
    ('э', '', '', 'E'),
5243
    ('ю', '', '', 'ju'),
5244
    ('я', '', '', 'ja'),
5245
)
5246
5247
# ash/rulesenglish.php
5248
5249 1
_ASH_RULES_ENGLISH = (
5250
    # CONSONANTS
5251
    ('tch', '', '', 'tS'),
5252
    ('ch', '', '', '(tS|x)'),
5253
    ('ck', '', '', 'k'),
5254
    ('cc', '', '[iey]', 'ks'),  # success, accent
5255
    ('c', '', 'c', ''),
5256
    ('c', '', '[iey]', 's'),  # circle
5257
    ('c', '', '', 'k'),  # candy
5258
    ('gh', '^', '', 'g'),  # ghost
5259
    ('gh', '', '', '(g|f|w)'),  # burgh | tough | bough
5260
    ('gn', '', '', '(gn|n)'),
5261
    ('g', '', '[iey]', '(g|dZ)'),  # get, gem, giant, gigabyte
5262
    # ("th","","","(6|8|t)"),
5263
    ('th', '', '', 't'),
5264
    ('kh', '', '', 'x'),
5265
    ('ph', '', '', 'f'),
5266
    ('sch', '', '', '(S|sk)'),
5267
    ('sh', '', '', 'S'),
5268
    ('who', '^', '', 'hu'),
5269
    ('wh', '^', '', 'w'),
5270
    ('h', '', '$', ''),  # hard to find an example that isn't in a name
5271
    ('h', '', '[^aeiou]', ''),  # hard to find an example that isn't in a name
5272
    ('h', '^', '', 'H'),
5273
    ('h', '', '', 'h'),
5274
    ('j', '', '', 'dZ'),
5275
    ('kn', '^', '', 'n'),  # knight
5276
    ('mb', '', '$', 'm'),
5277
    ('ng', '', '$', '(N|ng)'),
5278
    ('pn', '^', '', '(pn|n)'),
5279
    ('ps', '^', '', '(ps|s)'),
5280
    ('qu', '', '', 'kw'),
5281
    ('q', '', '', 'k'),
5282
    ('tia', '', '', '(So|Sa)'),
5283
    ('tio', '', '', 'So'),
5284
    ('wr', '^', '', 'r'),
5285
    (
5286
        'w',
5287
        '',
5288
        '',
5289
        '(w|v)',
5290
    ),  # the variant "v" is for spellings coming from German/Polish  # noqa: E501
5291
    ('x', '^', '', 'z'),
5292
    ('x', '', '', 'ks'),
5293
    # VOWELS
5294
    ('yi', ' ', '', 'i'),
5295
    ('y', '^', '[aeiouy]', 'j'),
5296
    ('aue', '', '', 'aue'),
5297
    ('oue', '', '', '(aue|oue)'),
5298
    ('ai', '', '', '(aj|e)'),  # rain | said
5299
    ('ay', '', '', 'aj'),
5300
    ('a', '', '[^aeiou]e', 'aj'),  # plane (actually "ej")
5301
    ('a', '', '', '(e|o|a)'),  # hat | call | part
5302
    ('ei', '', '', '(aj|i)'),  # weigh | receive
5303
    ('ey', '', '', '(aj|i)'),  # hey | barley
5304
    ('ear', '', '', 'ia'),  # tear
5305
    ('ea', '', '', '(i|e)'),  # reason | treasure
5306
    ('ee', '', '', 'i'),  # between
5307
    ('e', '', '[^aeiou]e', 'i'),  # meter
5308
    ('e', '', '$', '(|E)'),  # blame, badge
5309
    ('e', '', '', 'E'),  # bed
5310
    ('ie', '', '', 'i'),  # believe
5311
    ('i', '', '[^aeiou]e', 'aj'),  # five
5312
    ('i', '', '', 'I'),  # hit -- Morse disagrees, feels it should go to I
5313
    ('oa', '', '', 'ou'),  # toad
5314
    ('oi', '', '', 'oj'),  # join
5315
    ('oo', '', '', 'u'),  # food
5316
    ('ou', '', '', '(u|ou)'),  # through | tough | could
5317
    ('oy', '', '', 'oj'),  # boy
5318
    ('o', '', '[^aeiou]e', 'ou'),  # rode
5319
    ('o', '', '', '(o|a)'),  # hot -- Morse disagrees, feels it should go to 9
5320
    ('u', '', '[^aeiou]e', '(ju|u)'),  # cute | flute
5321
    (
5322
        'u',
5323
        '',
5324
        'r',
5325
        '(e|u)',
5326
    ),  # turn -- Morse disagrees, feels it should go to E  # noqa: E501
5327
    ('u', '', '', '(u|a)'),  # put
5328
    ('y', '', '', 'i'),
5329
    # TRIVIAL
5330
    ('b', '', '', 'b'),
5331
    ('d', '', '', 'd'),
5332
    ('f', '', '', 'f'),
5333
    ('g', '', '', 'g'),
5334
    ('k', '', '', 'k'),
5335
    ('l', '', '', 'l'),
5336
    ('m', '', '', 'm'),
5337
    ('n', '', '', 'n'),
5338
    ('p', '', '', 'p'),
5339
    ('r', '', '', 'r'),
5340
    ('s', '', '', 's'),
5341
    ('t', '', '', 't'),
5342
    ('v', '', '', 'v'),
5343
    ('z', '', '', 'z'),
5344
)
# ash/rulesfrench.php

# Ashkenazic
# Each rule is a 4-tuple of strings. Following the BMPM reference
# implementation these are (pattern, left context, right context, phonetic
# output); alternative outputs are written as '(a|b)'.
_ASH_RULES_FRENCH = (
    # CONSONANTS
    ('kh', '', '', 'x'),  # foreign
    ('ph', '', '', 'f'),
    ('ç', '', '', 's'),
    ('x', '', '', 'ks'),
    ('ch', '', '', 'S'),
    ('c', '', '[eiyéèê]', 's'),
    ('c', '', '', 'k'),
    ('gn', '', '', '(n|gn)'),
    ('g', '', '[eiy]', 'Z'),
    ('gue', '', '$', 'k'),
    ('gu', '', '[eiy]', 'g'),
    # ("aill","","e","aj"), # non Jewish
    # ("ll","","e","(l|j)"), # non Jewish
    ('que', '', '$', 'k'),
    ('qu', '', '', 'k'),
    ('q', '', '', 'k'),
    ('s', '[aeiouyéèê]', '[aeiouyéèê]', 'z'),
    ('h', '[bdgt]', '', ''),  # translit from Arabic
    ('h', '', '$', ''),  # foreign
    ('j', '', '', 'Z'),
    ('w', '', '', 'v'),
    ('ouh', '', '[aioe]', '(v|uh)'),
    ('ou', '', '[aeio]', 'v'),
    ('uo', '', '', '(vo|o)'),
    ('u', '', '[aeio]', 'v'),
    # VOWELS
    ('aue', '', '', 'aue'),
    ('eau', '', '', 'o'),
    # ("au","","","(o|au)"), # non Jewish
    ('ai', '', '', 'aj'),  # [e] is non Jewish
    ('ay', '', '', 'aj'),  # [e] is non Jewish
    ('é', '', '', 'e'),
    ('ê', '', '', 'e'),
    ('è', '', '', 'e'),
    ('à', '', '', 'a'),
    ('â', '', '', 'a'),
    ('où', '', '', 'u'),
    ('ou', '', '', 'u'),
    ('oi', '', '', 'oj'),  # [ua] is non Jewish
    ('ei', '', '', 'aj'),  # [e] is non Jewish
    ('ey', '', '', 'aj'),  # [e] non Jewish
    # ("eu","","","(e|o)"), # non Jewish
    ('y', '[ou]', '', 'j'),
    ('e', '', '$', '(e|)'),
    ('i', '', '[aou]', 'j'),
    ('y', '', '[aoeu]', 'j'),
    ('yi', '', '', 'i'),
    ('ii', '', '', 'i'),
    ('yy', '', '', 'i'),
    ('y', '', '', 'i'),
    # TRIVIAL
    ('a', '', '', 'a'),
    ('b', '', '', 'b'),
    ('d', '', '', 'd'),
    ('e', '', '', 'E'),  # only Ashkenazic
    ('f', '', '', 'f'),
    ('g', '', '', 'g'),
    ('h', '', '', 'h'),
    ('i', '', '', 'I'),  # only Ashkenazic
    ('k', '', '', 'k'),
    ('l', '', '', 'l'),
    ('m', '', '', 'm'),
    ('n', '', '', 'n'),
    ('o', '', '', 'o'),
    ('p', '', '', 'p'),
    ('r', '', '', 'r'),
    ('s', '', '', 's'),
    ('t', '', '', 't'),
    ('u', '', '', 'u'),
    ('v', '', '', 'v'),
    ('z', '', '', 'z'),
)
# ash/rulesgerman.php

# Ashkenazic
# Each rule is a 4-tuple of strings. Following the BMPM reference
# implementation these are (pattern, left context, right context, phonetic
# output); alternative outputs are written as '(a|b)'.
_ASH_RULES_GERMAN = (
    # CONSONANTS
    ('ziu', '', '', 'tsu'),
    ('zia', '', '', 'tsa'),
    ('zio', '', '', 'tso'),
    ('ssch', '', '', 'S'),
    ('chsch', '', '', 'xS'),
    ('ewitsch', '', '$', 'evitS'),
    ('owitsch', '', '$', 'ovitS'),
    ('evitsch', '', '$', 'evitS'),
    ('ovitsch', '', '$', 'ovitS'),
    ('witsch', '', '$', 'vitS'),
    ('vitsch', '', '$', 'vitS'),
    ('sch', '', '', 'S'),
    ('chs', '', '', 'ks'),
    ('ch', '', '', 'x'),
    ('ck', '', '', 'k'),
    ('c', '', '[eiy]', 'ts'),
    ('sp', '^', '', 'Sp'),
    ('st', '^', '', 'St'),
    ('ssp', '', '', '(Sp|sp)'),
    ('sp', '', '', '(Sp|sp)'),
    ('sst', '', '', '(St|st)'),
    ('st', '', '', '(St|st)'),
    ('pf', '', '', '(pf|p|f)'),
    ('ph', '', '', '(ph|f)'),
    ('qu', '', '', 'kv'),
    ('ewitz', '', '$', '(evits|evitS)'),
    ('ewiz', '', '$', '(evits|evitS)'),
    ('evitz', '', '$', '(evits|evitS)'),
    ('eviz', '', '$', '(evits|evitS)'),
    ('owitz', '', '$', '(ovits|ovitS)'),
    ('owiz', '', '$', '(ovits|ovitS)'),
    ('ovitz', '', '$', '(ovits|ovitS)'),
    ('oviz', '', '$', '(ovits|ovitS)'),
    ('witz', '', '$', '(vits|vitS)'),
    ('wiz', '', '$', '(vits|vitS)'),
    ('vitz', '', '$', '(vits|vitS)'),
    ('viz', '', '$', '(vits|vitS)'),
    ('tz', '', '', 'ts'),
    ('thal', '', '$', 'tal'),
    ('th', '^', '', 't'),
    ('th', '', '[äöüaeiou]', '(t|th)'),
    ('th', '', '', 't'),
    ('rh', '^', '', 'r'),
    ('h', '[aeiouyäöü]', '', ''),
    ('h', '^', '', 'H'),
    ('ss', '', '', 's'),
    ('s', '', '[äöüaeiouy]', '(z|s)'),
    ('s', '[aeiouyäöüj]', '[aeiouyäöü]', 'z'),
    ('ß', '', '', 's'),
    # VOWELS
    ('ij', '', '$', 'i'),
    ('aue', '', '', 'aue'),
    ('ue', '', '', 'Q'),
    ('ae', '', '', 'Y'),
    ('oe', '', '', 'Y'),
    ('ü', '', '', 'Q'),
    ('ä', '', '', '(Y|e)'),
    ('ö', '', '', 'Y'),
    ('ei', '', '', 'aj'),
    ('ey', '', '', 'aj'),
    ('eu', '', '', '(aj|oj)'),
    ('i', '[aou]', '', 'j'),
    ('y', '[aou]', '', 'j'),
    ('ie', '', '', 'I'),
    ('i', '', '[aou]', 'j'),
    ('y', '', '[aoeu]', 'j'),
    # FOREIGN LETTERs
    ('ñ', '', '', 'n'),
    ('ã', '', '', 'a'),
    ('ő', '', '', 'o'),
    ('ű', '', '', 'u'),
    ('ç', '', '', 's'),
    # ALPHABET
    ('a', '', '', 'A'),
    ('b', '', '', 'b'),
    ('c', '', '', 'k'),
    ('d', '', '', 'd'),
    ('e', '', '', 'E'),
    ('f', '', '', 'f'),
    ('g', '', '', 'g'),
    ('h', '', '', 'h'),
    ('i', '', '', 'I'),
    ('j', '', '', 'j'),
    ('k', '', '', 'k'),
    ('l', '', '', 'l'),
    ('m', '', '', 'm'),
    ('n', '', '', 'n'),
    ('o', '', '', 'O'),
    ('p', '', '', 'p'),
    ('q', '', '', 'k'),
    ('r', '', '', 'r'),
    ('s', '', '', 's'),
    ('t', '', '', 't'),
    ('u', '', '', 'U'),
    ('v', '', '', '(f|v)'),
    ('w', '', '', 'v'),
    ('x', '', '', 'ks'),
    ('y', '', '', 'i'),
    ('z', '', '', 'ts'),
)
# ash/ruleshebrew.php

# Ashkenazic
# Each rule is a 4-tuple of strings. Following the BMPM reference
# implementation these are (pattern, left context, right context, phonetic
# output). Patterns here are Hebrew-script letters or letter pairs.
_ASH_RULES_HEBREW = (
    ('אי', '', '', 'i'),
    ('עי', '', '', 'i'),
    ('עו', '', '', 'VV'),
    ('או', '', '', 'VV'),
    ('ג׳', '', '', 'Z'),
    ('ד׳', '', '', 'dZ'),
    ('א', '', '', 'L'),
    ('ב', '', '', 'b'),
    ('ג', '', '', 'g'),
    ('ד', '', '', 'd'),
    ('ה', '^', '', '1'),
    ('ה', '', '$', '1'),
    ('ה', '', '', ''),
    ('וו', '', '', 'V'),
    ('וי', '', '', 'WW'),
    ('ו', '', '', 'W'),
    ('ז', '', '', 'z'),
    ('ח', '', '', 'X'),
    ('ט', '', '', 'T'),
    ('יי', '', '', 'i'),
    ('י', '', '', 'i'),
    ('ך', '', '', 'X'),
    ('כ', '^', '', 'K'),
    ('כ', '', '', 'k'),
    ('ל', '', '', 'l'),
    ('ם', '', '', 'm'),
    ('מ', '', '', 'm'),
    ('ן', '', '', 'n'),
    ('נ', '', '', 'n'),
    ('ס', '', '', 's'),
    ('ע', '', '', 'L'),
    ('ף', '', '', 'f'),
    ('פ', '', '', 'f'),
    ('ץ', '', '', 'C'),
    ('צ', '', '', 'C'),
    ('ק', '', '', 'K'),
    ('ר', '', '', 'r'),
    ('ש', '', '', 's'),
    ('ת', '', '', 'TB'),  # only Ashkenazic
)
# ash/ruleshungarian.php

# ASHKENAZIC
# Each rule is a 4-tuple of strings. Following the BMPM reference
# implementation these are (pattern, left context, right context, phonetic
# output); alternative outputs are written as '(a|b)'.
_ASH_RULES_HUNGARIAN = (
    # CONSONANTS
    ('sz', '', '', 's'),
    ('zs', '', '', 'Z'),
    ('cs', '', '', 'tS'),
    ('ay', '', '', '(oj|aj)'),
    ('ai', '', '', '(oj|aj)'),
    ('aj', '', '', '(oj|aj)'),
    ('ei', '', '', 'aj'),  # German element
    ('ey', '', '', 'aj'),  # German element
    ('y', '[áo]', '', 'j'),
    ('i', '[áo]', '', 'j'),
    ('ee', '', '', '(aj|e)'),  # actually ej
    ('ely', '', '', '(aj|eli)'),  # actually ej
    ('ly', '', '', '(j|li)'),
    ('gy', '', '[aeouáéóúüöőű]', 'dj'),
    ('gy', '', '', '(d|gi)'),
    ('ny', '', '[aeouáéóúüöőű]', 'nj'),
    ('ny', '', '', '(n|ni)'),
    ('ty', '', '[aeouáéóúüöőű]', 'tj'),
    ('ty', '', '', '(t|ti)'),
    ('qu', '', '', '(ku|kv)'),
    ('h', '', '$', ''),
    # VOWELS
    ('á', '', '', 'a'),
    ('é', '', '', 'e'),
    ('í', '', '', 'i'),
    ('ó', '', '', 'o'),
    ('ö', '', '', 'Y'),
    ('ő', '', '', 'Y'),
    ('ú', '', '', 'u'),
    ('ü', '', '', 'Q'),
    ('ű', '', '', 'Q'),
    # LATIN ALPHABET
    ('a', '', '', 'a'),
    ('b', '', '', 'b'),
    ('c', '', '', 'ts'),
    ('d', '', '', 'd'),
    ('e', '', '', 'E'),
    ('f', '', '', 'f'),
    ('g', '', '', 'g'),
    ('h', '', '', 'h'),
    ('i', '', '', 'I'),
    ('j', '', '', 'j'),
    ('k', '', '', 'k'),
    ('l', '', '', 'l'),
    ('m', '', '', 'm'),
    ('n', '', '', 'n'),
    ('o', '', '', 'o'),
    ('p', '', '', 'p'),
    ('q', '', '', 'k'),
    ('r', '', '', 'r'),
    ('s', '', '', '(S|s)'),
    ('t', '', '', 't'),
    ('u', '', '', 'u'),
    ('v', '', '', 'v'),
    ('w', '', '', 'v'),
    ('x', '', '', 'ks'),
    ('y', '', '', 'i'),
    ('z', '', '', 'z'),
)
# ash/rulespolish.php

# Ashkenazic
# Each rule is a 4-tuple of strings. Following the BMPM reference
# implementation these are (pattern, left context, right context, phonetic
# output); alternative outputs are written as '(a|b)'.
_ASH_RULES_POLISH = (
    # CONVERTING FEMININE TO MASCULINE
    ('ska', '', '$', 'ski'),
    ('cka', '', '$', 'tski'),
    ('lowa', '', '$', '(lova|lof|l|el)'),
    ('kowa', '', '$', '(kova|kof|k|ek)'),
    ('owa', '', '$', '(ova|of|)'),
    ('lowna', '', '$', '(lovna|levna|l|el)'),
    ('kowna', '', '$', '(kovna|k|ek)'),
    ('owna', '', '$', '(ovna|)'),
    ('lówna', '', '$', '(l|el)'),
    ('kówna', '', '$', '(k|ek)'),
    ('ówna', '', '$', ''),
    ('a', '', '$', '(a|i)'),
    # CONSONANTS
    ('czy', '', '', 'tSi'),
    ('cze', '', '[bcdgkpstwzż]', '(tSe|tSF)'),
    ('ciewicz', '', '', '(tsevitS|tSevitS)'),
    ('siewicz', '', '', '(sevitS|SevitS)'),
    ('ziewicz', '', '', '(zevitS|ZevitS)'),
    ('riewicz', '', '', 'rjevitS'),
    ('diewicz', '', '', 'djevitS'),
    ('tiewicz', '', '', 'tjevitS'),
    ('iewicz', '', '', 'evitS'),
    ('ewicz', '', '', 'evitS'),
    ('owicz', '', '', 'ovitS'),
    ('icz', '', '', 'itS'),
    ('cz', '', '', 'tS'),
    ('ch', '', '', 'x'),
    ('cia', '', '[bcdgkpstwzż]', '(tSB|tsB)'),
    ('cia', '', '', '(tSa|tsa)'),
    ('cią', '', '[bp]', '(tSom|tsom)'),
    ('cią', '', '', '(tSon|tson)'),
    ('cię', '', '[bp]', '(tSem|tsem)'),
    ('cię', '', '', '(tSen|tsen)'),
    ('cie', '', '[bcdgkpstwzż]', '(tSF|tsF)'),
    ('cie', '', '', '(tSe|tse)'),
    ('cio', '', '', '(tSo|tso)'),
    ('ciu', '', '', '(tSu|tsu)'),
    ('ci', '', '', '(tSi|tsI)'),
    ('ć', '', '', '(tS|ts)'),
    ('ssz', '', '', 'S'),
    ('sz', '', '', 'S'),
    ('sia', '', '[bcdgkpstwzż]', '(SB|sB|sja)'),
    ('sia', '', '', '(Sa|sja)'),
    ('sią', '', '[bp]', '(Som|som)'),
    ('sią', '', '', '(Son|son)'),
    ('się', '', '[bp]', '(Sem|sem)'),
    ('się', '', '', '(Sen|sen)'),
    ('sie', '', '[bcdgkpstwzż]', '(SF|sF|se)'),
    ('sie', '', '', '(Se|se)'),
    ('sio', '', '', '(So|so)'),
    ('siu', '', '', '(Su|sju)'),
    ('si', '', '', '(Si|sI)'),
    ('ś', '', '', '(S|s)'),
    ('zia', '', '[bcdgkpstwzż]', '(ZB|zB|zja)'),
    ('zia', '', '', '(Za|zja)'),
    ('zią', '', '[bp]', '(Zom|zom)'),
    ('zią', '', '', '(Zon|zon)'),
    ('zię', '', '[bp]', '(Zem|zem)'),
    ('zię', '', '', '(Zen|zen)'),
    ('zie', '', '[bcdgkpstwzż]', '(ZF|zF)'),
    ('zie', '', '', '(Ze|ze)'),
    ('zio', '', '', '(Zo|zo)'),
    ('ziu', '', '', '(Zu|zju)'),
    ('zi', '', '', '(Zi|zI)'),
    ('że', '', '[bcdgkpstwzż]', '(Ze|ZF)'),
    # NOTE(review): same pattern and contexts as the rule above, so if the
    # first matching rule wins this one can never fire -- kept verbatim;
    # confirm against the reference data.
    ('że', '', '[bcdgkpstwzż]', '(Ze|ZF|ze|zF)'),
    ('że', '', '', 'Ze'),
    ('źe', '', '', '(Ze|ze)'),
    ('ży', '', '', 'Zi'),
    ('źi', '', '', '(Zi|zi)'),
    ('ż', '', '', 'Z'),
    ('ź', '', '', '(Z|z)'),
    ('rze', 't', '', '(Se|re)'),
    ('rze', '', '', '(Ze|re|rZe)'),
    ('rzy', 't', '', '(Si|ri)'),
    ('rzy', '', '', '(Zi|ri|rZi)'),
    ('rz', 't', '', '(S|r)'),
    ('rz', '', '', '(Z|r|rZ)'),
    ('lio', '', '', '(lo|le)'),
    ('ł', '', '', 'l'),
    ('ń', '', '', 'n'),
    ('qu', '', '', 'k'),
    ('s', '', 's', ''),
    # VOWELS
    ('ó', '', '', '(u|o)'),
    ('ą', '', '[bp]', 'om'),
    ('ę', '', '[bp]', 'em'),
    ('ą', '', '', 'on'),
    ('ę', '', '', 'en'),
    ('ije', '', '', 'je'),
    ('yje', '', '', 'je'),
    ('iie', '', '', 'je'),
    ('yie', '', '', 'je'),
    ('iye', '', '', 'je'),
    ('yye', '', '', 'je'),
    ('ij', '', '[aou]', 'j'),
    ('yj', '', '[aou]', 'j'),
    ('ii', '', '[aou]', 'j'),
    ('yi', '', '[aou]', 'j'),
    ('iy', '', '[aou]', 'j'),
    ('yy', '', '[aou]', 'j'),
    ('rie', '', '', 'rje'),
    ('die', '', '', 'dje'),
    ('tie', '', '', 'tje'),
    ('ie', '', '[bcdgkpstwzż]', 'F'),
    ('ie', '', '', 'e'),
    ('aue', '', '', 'aue'),
    ('au', '', '', 'au'),
    ('ei', '', '', 'aj'),
    ('ey', '', '', 'aj'),
    ('ej', '', '', 'aj'),
    ('ai', '', '', 'aj'),
    ('ay', '', '', 'aj'),
    ('aj', '', '', 'aj'),
    ('i', '[ou]', '', 'j'),
    ('y', '[ou]', '', 'j'),
    ('i', '', '[aou]', 'j'),
    ('y', '', '[aeou]', 'j'),
    ('a', '', '[bcdgkpstwzż]', 'B'),
    ('e', '', '[bcdgkpstwzż]', '(E|F)'),
    ('o', '', '[bcćdgklłmnńrsśtwzźż]', 'P'),
    # ALPHABET
    ('a', '', '', 'a'),
    ('b', '', '', 'b'),
    ('c', '', '', 'ts'),
    ('d', '', '', 'd'),
    ('e', '', '', 'E'),
    ('f', '', '', 'f'),
    ('g', '', '', 'g'),
    ('h', '', '', '(h|x)'),
    ('i', '', '', 'I'),
    ('j', '', '', 'j'),
    ('k', '', '', 'k'),
    ('l', '', '', 'l'),
    ('m', '', '', 'm'),
    ('n', '', '', 'n'),
    ('o', '', '', 'o'),
    ('p', '', '', 'p'),
    ('q', '', '', 'k'),
    ('r', '', '', 'r'),
    ('s', '', '', 's'),
    ('t', '', '', 't'),
    ('u', '', '', 'u'),
    ('v', '', '', 'v'),
    ('w', '', '', 'v'),
    ('x', '', '', 'ks'),
    ('y', '', '', 'I'),
    ('z', '', '', 'z'),
)
# ash/rulesromanian.php

# Each rule is a 4-tuple of strings. Following the BMPM reference
# implementation these are (pattern, left context, right context, phonetic
# output); alternative outputs are written as '(a|b)'.
_ASH_RULES_ROMANIAN = (
    ('j', '', '', 'Z'),
    ('ce', '', '', 'tSe'),
    ('ci', '', '', '(tSi|tS)'),
    ('ch', '', '[ei]', 'k'),
    ('ch', '', '', 'x'),  # foreign
    ('c', '', '', 'k'),
    ('gi', '', '', '(dZi|dZ)'),
    ('g', '', '[ei]', 'dZ'),
    ('gh', '', '', 'g'),
    ('ei', '', '', 'aj'),
    ('i', '[aou]', '', 'j'),
    ('i', '', '[aeou]', 'j'),
    ('ţ', '', '', 'ts'),
    ('ş', '', '', 'S'),
    ('h', '', '', '(x|h)'),
    ('qu', '', '', 'k'),
    ('q', '', '', 'k'),
    ('w', '', '', 'v'),
    ('x', '', '', 'ks'),
    ('y', '', '', 'i'),
    ('î', '', '', 'i'),
    ('ea', '', '', 'ja'),
    ('ă', '', '', '(e|a)'),
    ('aue', '', '', 'aue'),
    ('a', '', '', 'a'),
    ('b', '', '', 'b'),
    ('d', '', '', 'd'),
    ('e', '', '', 'E'),
    ('f', '', '', 'f'),
    ('g', '', '', 'g'),
    ('i', '', '', 'I'),
    ('k', '', '', 'k'),
    ('l', '', '', 'l'),
    ('m', '', '', 'm'),
    ('n', '', '', 'n'),
    ('o', '', '', 'o'),
    ('p', '', '', 'p'),
    ('r', '', '', 'r'),
    ('s', '', '', 's'),
    ('t', '', '', 't'),
    ('u', '', '', 'u'),
    ('v', '', '', 'v'),
    ('z', '', '', 'z'),
)
# ash/rulesrussian.php

# Each rule is a 4-tuple of strings. Following the BMPM reference
# implementation these are (pattern, left context, right context, phonetic
# output); alternative outputs are written as '(a|b)'.
_ASH_RULES_RUSSIAN = (
    # CONVERTING FEMININE TO MASCULINE
    ('yna', '', '$', '(in|ina)'),
    ('ina', '', '$', '(in|ina)'),
    ('liova', '', '$', '(lof|lef)'),
    ('lova', '', '$', '(lof|lef|lova)'),
    ('ova', '', '$', '(of|ova)'),
    ('eva', '', '$', '(ef|ova)'),
    ('aia', '', '$', '(aja|i)'),
    ('aja', '', '$', '(aja|i)'),
    ('aya', '', '$', '(aja|i)'),
    # SPECIFIC CONSONANTS
    ('tsya', '', '', 'tsa'),
    ('tsyu', '', '', 'tsu'),
    ('tsia', '', '', 'tsa'),
    ('tsie', '', '', 'tse'),
    ('tsio', '', '', 'tso'),
    ('tsye', '', '', 'tse'),
    ('tsyo', '', '', 'tso'),
    ('tsiu', '', '', 'tsu'),
    ('sie', '', '', 'se'),
    ('sio', '', '', 'so'),
    ('zie', '', '', 'ze'),
    ('zio', '', '', 'zo'),
    ('sye', '', '', 'se'),
    ('syo', '', '', 'so'),
    ('zye', '', '', 'ze'),
    ('zyo', '', '', 'zo'),
    ('gauz', '', '$', 'haus'),
    ('gaus', '', '$', 'haus'),
    ('gol\'ts', '', '$', 'holts'),
    ('golts', '', '$', 'holts'),
    ('gol\'tz', '', '$', 'holts'),
    ('goltz', '', '$', 'holts'),
    ('gejmer', '', '$', 'hajmer'),
    ('gejm', '', '$', 'hajm'),
    ('geimer', '', '$', 'hajmer'),
    ('geim', '', '$', 'hajm'),
    ('geymer', '', '$', 'hajmer'),
    ('geym', '', '$', 'hajm'),
    ('gendler', '', '$', 'hendler'),
    ('gof', '', '$', 'hof'),
    ('gojf', '', '$', 'hojf'),
    ('goyf', '', '$', 'hojf'),
    ('goif', '', '$', 'hojf'),
    ('ger', '', '$', 'ger'),
    ('gen', '', '$', 'gen'),
    ('gin', '', '$', 'gin'),
    ('gg', '', '', 'g'),
    ('g', '[jaeoiuy]', '[aeoiu]', 'g'),
    ('g', '', '[aeoiu]', '(g|h)'),
    ('kh', '', '', 'x'),
    # in DJSRE the rule is simpler: ("ch","","","tS")
    ('ch', '', '', '(tS|x)'),
    ('sch', '', '', '(StS|S)'),
    ('ssh', '', '', 'S'),
    ('sh', '', '', 'S'),
    ('zh', '', '', 'Z'),
    ('tz', '', '$', 'ts'),  # not in DJSRE
    ('tz', '', '', '(ts|tz)'),  # not in DJSRE
    ('c', '', '[iey]', 's'),  # not in DJSRE
    ('c', '', '', 'k'),  # not in DJSRE
    ('qu', '', '', '(kv|k)'),  # not in DJSRE
    ('q', '', '', 'k'),  # not in DJSRE
    ('s', '', 's', ''),
    ('w', '', '', 'v'),  # not in DJSRE
    ('x', '', '', 'ks'),  # not in DJSRE
    # SPECIFIC VOWELS
    ('lya', '', '', 'la'),
    ('lyu', '', '', 'lu'),
    ('lia', '', '', 'la'),  # not in DJSRE
    ('liu', '', '', 'lu'),  # not in DJSRE
    ('lja', '', '', 'la'),  # not in DJSRE
    ('lju', '', '', 'lu'),  # not in DJSRE
    ('le', '', '', '(lo|lE)'),  # not in DJSRE
    ('lyo', '', '', '(lo|le)'),  # not in DJSRE
    ('lio', '', '', '(lo|le)'),
    ('ije', '', '', 'je'),
    ('ie', '', '', 'je'),
    ('iye', '', '', 'je'),
    ('iie', '', '', 'je'),
    ('yje', '', '', 'je'),
    ('ye', '', '', 'je'),
    ('yye', '', '', 'je'),
    ('yie', '', '', 'je'),
    ('ij', '', '[aou]', 'j'),
    ('iy', '', '[aou]', 'j'),
    ('ii', '', '[aou]', 'j'),
    ('yj', '', '[aou]', 'j'),
    ('yy', '', '[aou]', 'j'),
    ('yi', '', '[aou]', 'j'),
    ('io', '', '', '(jo|e)'),
    ('i', '', '[au]', 'j'),
    ('i', '[aou]', '', 'j'),  # not in DJSRE
    ('ei', '', '', 'aj'),  # not in DJSRE
    ('ey', '', '', 'aj'),  # not in DJSRE
    ('ej', '', '', 'aj'),
    ('yo', '', '', '(jo|e)'),  # not in DJSRE
    ('y', '', '[au]', 'j'),
    ('y', '[aiou]', '', 'j'),  # not in DJSRE
    # NOTE(review): the next four rules use a literal space as right context
    # while the 'yj'/'ij' rules after them use '$' -- kept verbatim; confirm
    # against the reference data.
    ('ii', '', ' ', 'i'),  # not in DJSRE
    ('iy', '', ' ', 'i'),  # not in DJSRE
    ('yy', '', ' ', 'i'),  # not in DJSRE
    ('yi', '', ' ', 'i'),  # not in DJSRE
    ('yj', '', '$', 'i'),
    ('ij', '', '$', 'i'),
    # in DJSRE the rule is simpler: ("e","^","","je")
    ('e', '^', '', '(je|E)'),
    # in DJSRE the rule is simpler: ("ee","","","(eje|aje)")
    ('ee', '', '', '(aje|i)'),
    ('e', '[aou]', '', 'je'),
    ('y', '', '', 'I'),
    ('oo', '', '', '(oo|u)'),  # not in DJSRE
    ('\'', '', '', ''),
    ('"', '', '', ''),
    ('aue', '', '', 'aue'),
    # TRIVIAL
    ('a', '', '', 'a'),
    ('b', '', '', 'b'),
    ('d', '', '', 'd'),
    ('e', '', '', 'E'),
    ('f', '', '', 'f'),
    ('g', '', '', 'g'),
    ('h', '', '', 'h'),  # not in DJSRE
    ('i', '', '', 'I'),
    ('j', '', '', 'j'),
    ('k', '', '', 'k'),
    ('l', '', '', 'l'),
    ('m', '', '', 'm'),
    ('n', '', '', 'n'),
    ('o', '', '', 'o'),
    ('p', '', '', 'p'),
    ('r', '', '', 'r'),
    ('s', '', '', 's'),
    ('t', '', '', 't'),
    ('u', '', '', 'u'),
    ('v', '', '', 'v'),
    ('z', '', '', 'z'),
)
# ash/rulesspanish.php

# Ashkenazic = Argentina
# Each rule is a 4-tuple of strings. Following the BMPM reference
# implementation these are (pattern, left context, right context, phonetic
# output); alternative outputs are written as '(a|b)'.
_ASH_RULES_SPANISH = (
    # CONSONANTS
    ('ñ', '', '', '(n|nj)'),
    ('ch', '', '', '(tS|dZ)'),  # dZ is typical for Argentina
    ('h', '[bdgt]', '', ''),  # translit. from Arabic
    ('h', '', '$', ''),  # foreign
    ('j', '', '', 'x'),
    ('x', '', '', 'ks'),
    ('ll', '', '', '(l|Z)'),  # Z is typical for Argentina, only Ashkenazic
    ('w', '', '', 'v'),  # foreign words
    ('v', '', '', '(b|v)'),
    ('b', '', '', '(b|v)'),
    ('m', '', '[bpvf]', '(m|n)'),
    ('c', '', '[ei]', 's'),
    ('c', '', '', 'k'),
    # as "c" before "e" or "i"; in Spain it is like unvoiced English "th"
    ('z', '', '', '(z|s)'),
    ('gu', '', '[ei]', '(g|gv)'),  # "gv" because "u" can actually be "ü"
    ('g', '', '[ei]', '(x|g)'),  # "g" only for foreign words
    ('qu', '', '', 'k'),
    ('q', '', '', 'k'),
    ('uo', '', '', '(vo|o)'),
    ('u', '', '[aei]', 'v'),
    # S or Z are peculiar to South America; only Ashkenazic
    ('y', '', '', '(i|j|S|Z)'),
    # VOWELS
    ('ü', '', '', 'v'),
    ('á', '', '', 'a'),
    ('é', '', '', 'e'),
    ('í', '', '', 'i'),
    ('ó', '', '', 'o'),
    ('ú', '', '', 'u'),
    # TRIVIAL
    ('a', '', '', 'a'),
    ('d', '', '', 'd'),
    ('e', '', '', 'E'),  # Only Ashkenazic
    ('f', '', '', 'f'),
    ('g', '', '', 'g'),
    ('h', '', '', 'h'),
    ('i', '', '', 'I'),  # Only Ashkenazic
    ('k', '', '', 'k'),
    ('l', '', '', 'l'),
    ('m', '', '', 'm'),
    ('n', '', '', 'n'),
    ('o', '', '', 'o'),
    ('p', '', '', 'p'),
    ('r', '', '', 'r'),
    ('s', '', '', 's'),
    ('t', '', '', 't'),
    ('u', '', '', 'u'),
)
BMDATA = {}

# BMDATA['gen'][section][key] lookup for the 'gen' rule family.
#
# Sections 'approx', 'exact' and 'rules' are keyed by language bit flag (the
# L_* powers of two defined at the top of this module); 'approx', 'exact' and
# 'hebrew' additionally carry a 'common' entry built by concatenating two rule
# tuples. Several languages reuse another language's table, so a trailing
# comment names the language a key stands for whenever the assigned table is
# named differently (the 'rules' section shows the key -> language pairing).
BMDATA['gen'] = {
    'approx': {
        1: _GEN_APPROX_ANY,
        2: _GEN_APPROX_ARABIC,
        'common': _GEN_EXACT_APPROX_COMMON + _GEN_APPROX_COMMON,
        4: _GEN_APPROX_RUSSIAN,  # Cyrillic
        8: _GEN_APPROX_FRENCH,  # Czech
        16: _GEN_APPROX_FRENCH,  # Dutch
        32: _GEN_APPROX_ENGLISH,
        64: _GEN_APPROX_FRENCH,
        128: _GEN_APPROX_GERMAN,
        256: _GEN_APPROX_FRENCH,  # Greek
        512: _GEN_APPROX_FRENCH + _GEN_APPROX_GREEKLATIN,
        1024: _GEN_APPROX_HEBREW,
        2048: _GEN_APPROX_FRENCH,  # Hungarian
        4096: _GEN_APPROX_FRENCH,  # Italian
        8192: _GEN_APPROX_FRENCH,  # Latvian
        16384: _GEN_APPROX_POLISH,
        32768: _GEN_APPROX_FRENCH,  # Portuguese
        65536: _GEN_APPROX_POLISH,  # Romanian
        131072: _GEN_APPROX_RUSSIAN,
        262144: _GEN_APPROX_FRENCH + _GEN_APPROX_SPANISH,
        524288: _GEN_APPROX_FRENCH,  # Turkish
    },
    'exact': {
        1: _GEN_EXACT_ANY,
        2: _GEN_EXACT_ARABIC,
        'common': _GEN_EXACT_APPROX_COMMON + _GEN_EXACT_COMMON,
        4: _GEN_EXACT_RUSSIAN,  # Cyrillic
        8: _GEN_EXACT_RUSSIAN,  # Czech
        16: _GEN_EXACT_DUTCH,
        32: _GEN_EXACT_RUSSIAN,  # English
        64: _GEN_EXACT_FRENCH,
        128: _GEN_EXACT_ANY,  # German
        256: _GEN_EXACT_GREEK,
        512: _GEN_EXACT_GREEKLATIN,
        1024: _GEN_EXACT_HEBREW,
        2048: _GEN_EXACT_RUSSIAN,  # Hungarian
        4096: _GEN_EXACT_ITALIAN,
        8192: _GEN_EXACT_LATVIAN,
        16384: _GEN_EXACT_POLISH,
        32768: _GEN_EXACT_PORTUGUESE,
        65536: _GEN_EXACT_RUSSIAN,  # Romanian
        131072: _GEN_EXACT_RUSSIAN,
        262144: _GEN_EXACT_SPANISH,
        524288: _GEN_EXACT_TURKISH,
    },
    'rules': {
        1: _GEN_RULES_ANY,
        2: _GEN_RULES_ARABIC,
        4: _GEN_RULES_CYRILLIC,
        8: _GEN_RULES_CZECH,
        16: _GEN_RULES_DUTCH,
        32: _GEN_RULES_ENGLISH,
        64: _GEN_RULES_FRENCH,
        128: _GEN_RULES_GERMAN,
        256: _GEN_RULES_GREEK,
        512: _GEN_RULES_GREEKLATIN,
        1024: _GEN_RULES_HEBREW,
        2048: _GEN_RULES_HUNGARIAN,
        4096: _GEN_RULES_ITALIAN,
        8192: _GEN_RULES_LATVIAN,
        16384: _GEN_RULES_POLISH,
        32768: _GEN_RULES_PORTUGUESE,
        65536: _GEN_RULES_ROMANIAN,
        131072: _GEN_RULES_RUSSIAN,
        262144: _GEN_RULES_SPANISH,
        524288: _GEN_RULES_TURKISH,
    },
    'hebrew': {
        'common': _GEN_EXACT_APPROX_COMMON + _GEN_HEBREW_COMMON,
    },
    'language_rules': _GEN_LANGUAGE_RULES,
    'languages': _GEN_LANGUAGES,
}
# BMDATA['sep'][section][key] lookup for the 'sep' rule family.
#
# Sections 'approx', 'exact' and 'rules' are keyed by language bit flag (the
# L_* powers of two defined at the top of this module); 'approx', 'exact' and
# 'hebrew' additionally carry a 'common' entry built by concatenating two rule
# tuples. A trailing comment names the language a key stands for whenever the
# assigned table is named differently (the 'rules' section shows the
# key -> language pairing).
BMDATA['sep'] = {
    'approx': {
        1: _SEP_APPROX_ANY,
        'common': _SEP_EXACT_APPROX_COMMON + _SEP_APPROX_COMMON,
        64: _SEP_APPROX_FRENCH,
        1024: _SEP_APPROX_HEBREW,
        4096: _SEP_APPROX_FRENCH,  # Italian
        32768: _SEP_APPROX_FRENCH,  # Portuguese
        262144: _SEP_APPROX_FRENCH,  # Spanish
    },
    'exact': {
        1: _SEP_EXACT_ANY,
        'common': _SEP_EXACT_APPROX_COMMON + _SEP_EXACT_COMMON,
        64: _SEP_EXACT_FRENCH,
        1024: _SEP_EXACT_HEBREW,
        4096: _SEP_EXACT_ITALIAN,
        32768: _SEP_EXACT_PORTUGUESE,
        262144: _SEP_EXACT_SPANISH,
    },
    'rules': {
        1: _SEP_RULES_ANY,
        64: _SEP_RULES_FRENCH,
        1024: _SEP_RULES_HEBREW,
        4096: _SEP_RULES_ITALIAN,
        32768: _SEP_RULES_PORTUGUESE,
        262144: _SEP_RULES_SPANISH,
    },
    'hebrew': {
        'common': _SEP_EXACT_APPROX_COMMON + _SEP_HEBREW_COMMON,
    },
    'language_rules': _SEP_LANGUAGE_RULES,
    'languages': _SEP_LANGUAGES,
}
# BMDATA['ash'][section][key] lookup for the 'ash' (Ashkenazic) rule family.
#
# Sections 'approx', 'exact' and 'rules' are keyed by language bit flag (the
# L_* powers of two defined at the top of this module); 'approx', 'exact' and
# 'hebrew' additionally carry a 'common' entry built by concatenating two rule
# tuples. A trailing comment names the language a key stands for whenever the
# assigned table is named differently (the 'rules' section shows the
# key -> language pairing).
BMDATA['ash'] = {
    'approx': {
        1: _ASH_APPROX_ANY,
        'common': _ASH_EXACT_APPROX_COMMON + _ASH_APPROX_COMMON,
        4: _ASH_APPROX_RUSSIAN,  # Cyrillic
        32: _ASH_APPROX_ENGLISH,
        64: _ASH_APPROX_FRENCH,
        128: _ASH_APPROX_GERMAN,
        1024: _ASH_APPROX_HEBREW,
        2048: _ASH_APPROX_FRENCH,  # Hungarian
        16384: _ASH_APPROX_POLISH,
        65536: _ASH_APPROX_POLISH,  # Romanian
        131072: _ASH_APPROX_RUSSIAN,
        262144: _ASH_APPROX_FRENCH,  # Spanish
    },
    'exact': {
        1: _ASH_EXACT_ANY,
        'common': _ASH_EXACT_APPROX_COMMON + _ASH_EXACT_COMMON,
        4: _ASH_EXACT_RUSSIAN,  # Cyrillic
        32: _ASH_EXACT_RUSSIAN,  # English
        64: _ASH_EXACT_RUSSIAN,  # French
        128: _ASH_EXACT_ANY,  # German
        1024: _ASH_EXACT_HEBREW,
        2048: _ASH_EXACT_RUSSIAN,  # Hungarian
        16384: _ASH_EXACT_POLISH,
        65536: _ASH_EXACT_RUSSIAN,  # Romanian
        131072: _ASH_EXACT_RUSSIAN,
        262144: _ASH_EXACT_RUSSIAN,  # Spanish
    },
    'rules': {
        1: _ASH_RULES_ANY,
        4: _ASH_RULES_CYRILLIC,
        32: _ASH_RULES_ENGLISH,
        64: _ASH_RULES_FRENCH,
        128: _ASH_RULES_GERMAN,
        1024: _ASH_RULES_HEBREW,
        2048: _ASH_RULES_HUNGARIAN,
        16384: _ASH_RULES_POLISH,
        65536: _ASH_RULES_ROMANIAN,
        131072: _ASH_RULES_RUSSIAN,
        262144: _ASH_RULES_SPANISH,
    },
    'hebrew': {
        'common': _ASH_EXACT_APPROX_COMMON + _ASH_HEBREW_COMMON,
    },
    'language_rules': _ASH_LANGUAGE_RULES,
    'languages': _ASH_LANGUAGES,
}
if __name__ == '__main__':
    # Run any doctests embedded in this module when executed as a script.
    import doctest

    doctest.testmod()