Test Failed
Push — master ( 64abe2...a464fa )
by Chris
04:02 queued 11s
created

abydos.phonetic.fr.fonem()   B

Complexity

Conditions 4

Size

Total Lines 128
Code Lines 92

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
cc 4
eloc 92
nop 1
dl 0
loc 128
rs 7.2618
c 0
b 0
f 0

How to fix   Long Method   

Long Method

Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.

For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.

Commonly applied refactorings include:

1
# -*- coding: utf-8 -*-
2
3
# Copyright 2018 by Christopher C. Little.
4
# This file is part of Abydos.
5
#
6
# Abydos is free software: you can redistribute it and/or modify
7
# it under the terms of the GNU General Public License as published by
8
# the Free Software Foundation, either version 3 of the License, or
9
# (at your option) any later version.
10
#
11
# Abydos is distributed in the hope that it will be useful,
12
# but WITHOUT ANY WARRANTY; without even the implied warranty of
13
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14
# GNU General Public License for more details.
15
#
16
# You should have received a copy of the GNU General Public License
17
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.
18
19
"""abydos.phonetic.fr.
20
21
The phonetic.fr module implements phonetic algorithms intended for French,
22
including:
23
24
    - FONEM
25
    - an early version of Henry Code
26
"""
27
28
from __future__ import unicode_literals
29
30
from re import compile as re_compile
31
from unicodedata import normalize as unicode_normalize
32
33
from six import text_type
34
35
__all__ = ['fonem', 'henry_early']
36
37
38
# FONEM rewrite rules, keyed by the rule identifiers used in the FONEM
# definition. Each value is a (pattern, replacement) pair: the pattern is
# either a plain string (applied with str.replace) or a compiled regex
# (applied with regex.sub). The table is built once at import time rather
# than on every call to fonem().
# I don't see a sane way of doing this without regexps :(
_FONEM_RULE_TABLE = {
    # Vowels & groups of vowels
    'V-1':     (re_compile('E?AU'), 'O'),
    'V-2,5':   (re_compile('(E?AU|O)L[TX]$'), 'O'),
    'V-3,4':   (re_compile('E?AU[TX]$'), 'O'),
    'V-6':     (re_compile('E?AUL?D$'), 'O'),
    'V-7':     (re_compile(r'(?<!G)AY$'), 'E'),
    'V-8':     (re_compile('EUX$'), 'EU'),
    'V-9':     (re_compile('EY(?=$|[BCDFGHJKLMNPQRSTVWXZ])'), 'E'),
    'V-10':    ('Y', 'I'),
    'V-11':    (re_compile('(?<=[AEIOUY])I(?=[AEIOUY])'), 'Y'),
    'V-12':    (re_compile('(?<=[AEIOUY])ILL'), 'Y'),
    'V-13':    (re_compile('OU(?=[AEOU]|I(?!LL))'), 'W'),
    'V-14':    (re_compile(r'([AEIOUY])(?=\1)'), ''),
    # Nasal vowels
    'V-15':    (re_compile('[AE]M(?=[BCDFGHJKLMPQRSTVWXZ])(?!$)'), 'EN'),
    'V-16':    (re_compile('OM(?=[BCDFGHJKLMPQRSTVWXZ])'), 'ON'),
    'V-17':    (re_compile('AN(?=[BCDFGHJKLMNPQRSTVWXZ])'), 'EN'),
    'V-18':    (re_compile('(AI[MN]|EIN)(?=[BCDFGHJKLMNPQRSTVWXZ]|$)'),
                'IN'),
    'V-19':    (re_compile('B(O|U|OU)RNE?$'), 'BURN'),
    'V-20':    (re_compile('(^IM|(?<=[BCDFGHJKLMNPQRSTVWXZ])' +
                           'IM(?=[BCDFGHJKLMPQRSTVWXZ]))'), 'IN'),
    # Consonants and groups of consonants
    'C-1':     ('BV', 'V'),
    'C-2':     (re_compile('(?<=[AEIOUY])C(?=[EIY])'), 'SS'),
    'C-3':     (re_compile('(?<=[BDFGHJKLMNPQRSTVWZ])C(?=[EIY])'), 'S'),
    'C-4':     (re_compile('^C(?=[EIY])'), 'S'),
    'C-5':     (re_compile('^C(?=[OUA])'), 'K'),
    'C-6':     (re_compile('(?<=[AEIOUY])C$'), 'K'),
    'C-7':     (re_compile('C(?=[BDFGJKLMNPQRSTVWXZ])'), 'K'),
    'C-8':     (re_compile('CC(?=[AOU])'), 'K'),
    'C-9':     (re_compile('CC(?=[EIY])'), 'X'),
    'C-10':    (re_compile('G(?=[EIY])'), 'J'),
    'C-11':    (re_compile('GA(?=I?[MN])'), 'G#'),
    'C-12':    (re_compile('GE(O|AU)'), 'JO'),
    'C-13':    (re_compile('GNI(?=[AEIOUY])'), 'GN'),
    'C-14':    (re_compile('(?<![PCS])H'), ''),
    'C-15':    ('JEA', 'JA'),
    'C-16':    (re_compile('^MAC(?=[BCDFGHJKLMNPQRSTVWXZ])'), 'MA#'),
    'C-17':    (re_compile('^MC'), 'MA#'),
    'C-18':    ('PH', 'F'),
    'C-19':    ('QU', 'K'),
    'C-20':    (re_compile('^SC(?=[EIY])'), 'S'),
    'C-21':    (re_compile('(?<=.)SC(?=[EIY])'), 'SS'),
    'C-22':    (re_compile('(?<=.)SC(?=[AOU])'), 'SK'),
    'C-23':    ('SH', 'CH'),
    'C-24':    (re_compile('TIA$'), 'SSIA'),
    'C-25':    (re_compile('(?<=[AIOUY])W'), ''),
    'C-26':    (re_compile('X[CSZ]'), 'X'),
    'C-27':    (re_compile('(?<=[AEIOUY])Z|(?<=[BCDFGHJKLMNPQRSTVWXZ])' +
                           'Z(?=[BCDFGHJKLMNPQRSTVWXZ])'), 'S'),
    'C-28':    (re_compile(r'([BDFGHJKMNPQRTVWXZ])\1'), r'\1'),
    'C-28a':   (re_compile('CC(?=[BCDFGHJKLMNPQRSTVWXZ]|$)'), 'C'),
    'C-28b':   (re_compile('((?<=[BCDFGHJKLMNPQRSTVWXZ])|^)SS'), 'S'),
    'C-28bb':  (re_compile('SS(?=[BCDFGHJKLMNPQRSTVWXZ]|$)'), 'S'),
    'C-28c':   (re_compile('((?<=[^I])|^)LL'), 'L'),
    'C-28d':   (re_compile('ILE$'), 'ILLE'),
    # The replacement callable keeps whichever alternative's group matched:
    # group 1 (a preserved word-final cluster) or group 2 (the first of two
    # word-final consonants, so that the second one is dropped).
    'C-29':    (re_compile('(ILS|[CS]H|[MN]P|R[CFKLNSX])$|([BCDFGHJKL' +
                           'MNPQRSTVWXZ])[BCDFGHJKLMNPQRSTVWXZ]$'),
                lambda m: (m.group(1) or '') + (m.group(2) or '')),
    'C-30,32': (re_compile('^(SA?INT?|SEI[NM]|CINQ?|ST)(?!E)-?'), 'ST-'),
    'C-31,33': (re_compile('^(SAINTE|STE)-?'), 'STE-'),
    # Rules to undo rule bleeding prevention in C-11, C-16, C-17
    'C-34':    ('G#', 'GA'),
    'C-35':    ('MA#', 'MAC'),
}

# Order in which the rules above are applied. The de-doubling rules (V-14 and
# the C-28 family) appear twice on purpose: once up front and once again after
# the other rewrites. C-34/C-35 run last to remove the '#' markers.
_FONEM_RULE_ORDER = (
    'V-14', 'C-28', 'C-28a', 'C-28b', 'C-28bb', 'C-28c', 'C-28d',
    'C-12',
    'C-8', 'C-9', 'C-10',
    'C-16', 'C-17', 'C-2', 'C-3', 'C-7',
    'V-2,5', 'V-3,4', 'V-6',
    'V-1', 'C-14',
    'C-31,33', 'C-30,32',
    'C-11', 'V-15', 'V-17', 'V-18',
    'V-7', 'V-8', 'V-9', 'V-10', 'V-11', 'V-12', 'V-13', 'V-16',
    'V-19', 'V-20',
    'C-1', 'C-4', 'C-5', 'C-6', 'C-13', 'C-15',
    'C-18', 'C-19', 'C-20', 'C-21', 'C-22', 'C-23', 'C-24',
    'C-25', 'C-26', 'C-27',
    'C-29',
    'V-14', 'C-28', 'C-28a', 'C-28b', 'C-28bb', 'C-28c', 'C-28d',
    'C-34', 'C-35',
)

# Characters retained by fonem() after normalization: A-Z plus the hyphen.
_FONEM_KEPT_CHARS = frozenset('ABCDEFGHIJKLMNOPQRSTUVWXYZ-')


def fonem(word):
    """Return the FONEM code of a word.

    FONEM is a phonetic algorithm designed for French (particularly surnames in
    Saguenay, Canada), defined in :cite:`Bouchard:1981`.

    Guillaume Plique's Javascript implementation :cite:`Plique:2018` at
    https://github.com/Yomguithereal/talisman/blob/master/src/phonetics/french/fonem.js
    was also consulted for this implementation.

    The word is upper-cased, NFKD-normalized, has the ligatures Æ and Œ
    expanded to 'AE' and 'OE', and is stripped of every character other than
    A-Z and '-'. The FONEM rewrite rules are then applied one after another,
    each to the output of the previous one.

    :param str word: the word to transform
    :returns: the FONEM code
    :rtype: str

    >>> fonem('Marchand')
    'MARCHEN'
    >>> fonem('Beaulieu')
    'BOLIEU'
    >>> fonem('Beaumont')
    'BOMON'
    >>> fonem('Legrand')
    'LEGREN'
    >>> fonem('Pelletier')
    'PELETIER'
    """
    # normalize, upper-case, and filter non-French letters
    word = unicode_normalize('NFKD', text_type(word.upper()))
    # 198 and 338 are the code points of the ligatures Æ and Œ
    word = word.translate({198: 'AE', 338: 'OE'})
    word = ''.join(c for c in word if c in _FONEM_KEPT_CHARS)

    for rule in _FONEM_RULE_ORDER:
        pattern, repl = _FONEM_RULE_TABLE[rule]
        if isinstance(pattern, text_type):
            # plain-string rule: literal substring replacement
            word = word.replace(pattern, repl)
        else:
            # compiled-regex rule
            word = pattern.sub(repl, word)

    return word
166
167
168
def henry_early(word, max_length=3):
    """Calculate the early version of the Henry code for a word.

    The early version of Henry coding is given in :cite:`Legare:1972`. This is
    different from the later version defined in :cite:`Henry:1976`.

    The word is upper-cased, NFKD-normalized and stripped of every character
    other than A-Z. An initial vowel is adjusted (rule Ib), the word is then
    encoded letter by letter (rule II), the end of the code is trimmed (rule
    IIe), and finally every vowel except a leading one is removed.

    :param str word: the word to transform
    :param int max_length: the length of the code returned (defaults to 3);
        pass -1 to return the code without truncation
    :returns: the early Henry code (the empty string if the word contains no
        letters A-Z)
    :rtype: str

    >>> henry_early('Marchand')
    'MRC'
    >>> henry_early('Beaulieu')
    'BL'
    >>> henry_early('Beaumont')
    'BM'
    >>> henry_early('Legrand')
    'LGR'
    >>> henry_early('Pelletier')
    'PLT'
    """
    _cons = {'B', 'C', 'D', 'F', 'G', 'H', 'J', 'K', 'L', 'M', 'N', 'P', 'Q',
             'R', 'S', 'T', 'V', 'W', 'X', 'Z'}
    _vows = {'A', 'E', 'I', 'O', 'U', 'Y'}
    _diph = {'AI': 'E', 'AY': 'E', 'EI': 'E', 'AU': 'O', 'OI': 'O', 'OU': 'O',
             'EU': 'U'}
    # _unaltered = {'B', 'D', 'F', 'J', 'K', 'L', 'M', 'N', 'R', 'T', 'V'}
    _simple = {'W': 'V', 'X': 'S', 'Z': 'S'}
    # Derived sets, computed once instead of on every loop iteration
    _cons_not_mn = _cons - {'M', 'N'}
    _cons_not_lr = _cons - {'L', 'R'}

    word = unicode_normalize('NFKD', text_type(word.upper()))
    word = ''.join(c for c in word if c in
                   {'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L',
                    'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X',
                    'Y', 'Z'})

    if not word:
        return ''

    # Rule Ia seems to be covered entirely in II

    # Rule Ib
    # NB: one-character slices (word[1:2], not word[1]) are used throughout
    # so that reading past the end of the word yields '' instead of raising.
    if word[0] in _vows:
        # Ib1
        if (((word[1:2] in _cons_not_mn and word[2:3] in _cons) or
             (word[1:2] in _cons and word[2:3] not in _cons))):
            if word[0] == 'Y':
                word = 'I'+word[1:]
        # Ib2
        elif word[1:2] in {'M', 'N'} and word[2:3] in _cons:
            if word[0] == 'E':
                word = 'A'+word[1:]
            elif word[0] in {'I', 'U', 'Y'}:
                word = 'E'+word[1:]
        # Ib3
        elif word[:2] in _diph:
            word = _diph[word[:2]]+word[2:]
        # Ib4
        elif word[1:2] in _vows and word[0] == 'Y':
            word = 'I' + word[1:]

    code = ''
    skip = 0  # number of upcoming characters to pass over without encoding

    # Rule II
    for pos, char in enumerate(word):
        nxch = word[pos+1:pos+2]  # next character, or '' at the end
        prev = word[pos-1:pos]    # previous character, or '' at the start

        if skip:
            skip -= 1
        elif char in _vows:
            code += char
        # IIc
        elif char == nxch:
            # doubled consonant: emit it once and skip its twin
            skip = 1
            code += char
        elif word[pos:pos+2] in {'CQ', 'DT', 'SC'}:
            # the first letter of these pairs is dropped
            continue
        # IIb
        elif char in _simple:
            code += _simple[char]
        elif char in {'C', 'G', 'P', 'Q', 'S'}:
            if char == 'C':
                if nxch in {'A', 'O', 'U', 'L', 'R'}:
                    code += 'K'
                elif nxch in {'E', 'I', 'Y'}:
                    code += 'S'
                elif nxch == 'H':
                    if word[pos+2:pos+3] in _vows:
                        code += 'C'
                    else:  # CHR, CHL, etc.
                        code += 'K'
                else:
                    code += 'C'
            elif char == 'G':
                if nxch in {'A', 'O', 'U', 'L', 'R'}:
                    code += 'G'
                elif nxch in {'E', 'I', 'Y'}:
                    code += 'J'
                elif nxch == 'N':
                    code += 'N'
                # in any other context, G contributes nothing to the code
            elif char == 'P':
                if nxch != 'H':
                    code += 'P'
                else:
                    code += 'F'
            elif char == 'Q':
                if word[pos+1:pos+3] in {'UE', 'UI', 'UY'}:
                    code += 'G'
                else:  # QUA, QUO, etc.
                    code += 'K'
            else:  # S...
                # SAINTE / SAINT / STE / ST are all encoded as a single 'X';
                # the longest prefix is tested first
                if word[pos:pos+6] == 'SAINTE':
                    code += 'X'
                    skip = 5
                elif word[pos:pos+5] == 'SAINT':
                    code += 'X'
                    skip = 4
                elif word[pos:pos+3] == 'STE':
                    code += 'X'
                    skip = 2
                elif word[pos:pos+2] == 'ST':
                    code += 'X'
                    skip = 1
                elif nxch in _cons:
                    continue
                else:
                    code += 'S'
        # IId
        elif char == 'H' and prev in _cons:
            continue
        elif char in _cons_not_lr and nxch in _cons_not_lr:
            continue
        elif char == 'L' and nxch in {'M', 'N'}:
            continue
        elif char in {'M', 'N'} and prev in _vows and nxch in _cons:
            continue
        # IIa
        else:
            code += char

    # IIe1
    if code[-4:] in {'AULT', 'EULT', 'OULT'}:
        code = code[:-2]
    # The remaining IIe1 endings (vowel + 'MPS'; vowel + 'MB', 'MP', 'ND',
    # 'NS' or 'NT') need no handling here: they are blocked by rules above.
    elif code[-2:-1] == 'R' and code[-1:] in _cons:
        code = code[:-1]
    # IIe2
    elif code[-2:-1] in _vows and code[-1:] in {'D', 'M', 'N', 'S', 'T'}:
        code = code[:-1]
    elif code[-2:] == 'ER':
        code = code[:-1]

    # Drop non-initial vowels (the keys are the code points of A, E, I, O, U
    # and Y)
    code = code[:1]+code[1:].translate({65: '', 69: '', 73: '', 79: '', 85: '',
                                        89: ''})

    if max_length != -1:
        code = code[:max_length]

    return code
335
336
337
# When executed as a script, run the doctests embedded in this module's
# docstrings.
if __name__ == '__main__':
    import doctest
    doctest.testmod()
340