Completed
Branch master (78a222)
by Chris
14:36
created

abydos.phonetic._fr.henry_early()   F

Complexity

Conditions 56

Size

Total Lines 225
Code Lines 159

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 100
CRAP Score 56

Importance

Changes 0
Metric Value
eloc 159
dl 0
loc 225
ccs 100
cts 100
cp 1
rs 0
c 0
b 0
f 0
cc 56
nop 2
crap 56

How to fix   Long Method    Complexity   

Long Method

Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.

For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.

Commonly applied refactorings include Extract Method.

Complexity

Complex classes like abydos.phonetic._fr.henry_early() often do a lot of different things. To break such a class down, we need to identify a cohesive component within that class. A common approach to find such a component is to look for fields/methods that share the same prefixes or suffixes.

Once you have determined the fields that belong together, you can apply the Extract Class refactoring. If the component makes sense as a sub-class, Extract Subclass is also a candidate, and is often faster.

1
# -*- coding: utf-8 -*-
2
3
# Copyright 2018 by Christopher C. Little.
4
# This file is part of Abydos.
5
#
6
# Abydos is free software: you can redistribute it and/or modify
7
# it under the terms of the GNU General Public License as published by
8
# the Free Software Foundation, either version 3 of the License, or
9
# (at your option) any later version.
10
#
11
# Abydos is distributed in the hope that it will be useful,
12
# but WITHOUT ANY WARRANTY; without even the implied warranty of
13
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14
# GNU General Public License for more details.
15
#
16
# You should have received a copy of the GNU General Public License
17
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.
18
19 1
"""abydos.phonetic._fr.
20
21
The phonetic._fr module implements phonetic algorithms intended for French,
22
including:
23
24
    - FONEM
25
    - an early version of Henry Code
26
"""
27
28 1
from __future__ import unicode_literals
29
30 1
from re import compile as re_compile
31 1
from unicodedata import normalize as unicode_normalize
32
33 1
from six import text_type
34
35 1
__all__ = ['fonem', 'henry_early']
36
37
38 1
# Characters kept by fonem() after normalization.  The hyphen is retained
# because the SAINT/SAINTE rules (C-30,32 and C-31,33) match and emit it.
_FONEM_LETTERS = frozenset('ABCDEFGHIJKLMNOPQRSTUVWXYZ-')

# FONEM rewrite rules, keyed by rule name.  Each value is a
# (pattern, replacement) pair; the pattern is either a plain string (applied
# with str.replace) or a compiled regex (applied with .sub).
# I don't see a sane way of doing this without regexps :(
# Built once at import time so the patterns are not re-created on every call.
_FONEM_RULE_TABLE = {
    # Vowels & groups of vowels
    'V-1': (re_compile('E?AU'), 'O'),
    'V-2,5': (re_compile('(E?AU|O)L[TX]$'), 'O'),
    'V-3,4': (re_compile('E?AU[TX]$'), 'O'),
    'V-6': (re_compile('E?AUL?D$'), 'O'),
    'V-7': (re_compile(r'(?<!G)AY$'), 'E'),
    'V-8': (re_compile('EUX$'), 'EU'),
    'V-9': (re_compile('EY(?=$|[BCDFGHJKLMNPQRSTVWXZ])'), 'E'),
    'V-10': ('Y', 'I'),
    'V-11': (re_compile('(?<=[AEIOUY])I(?=[AEIOUY])'), 'Y'),
    'V-12': (re_compile('(?<=[AEIOUY])ILL'), 'Y'),
    'V-13': (re_compile('OU(?=[AEOU]|I(?!LL))'), 'W'),
    'V-14': (re_compile(r'([AEIOUY])(?=\1)'), ''),
    # Nasal vowels
    'V-15': (re_compile('[AE]M(?=[BCDFGHJKLMPQRSTVWXZ])(?!$)'), 'EN'),
    'V-16': (re_compile('OM(?=[BCDFGHJKLMPQRSTVWXZ])'), 'ON'),
    'V-17': (re_compile('AN(?=[BCDFGHJKLMNPQRSTVWXZ])'), 'EN'),
    'V-18': (re_compile('(AI[MN]|EIN)(?=[BCDFGHJKLMNPQRSTVWXZ]|$)'), 'IN'),
    'V-19': (re_compile('B(O|U|OU)RNE?$'), 'BURN'),
    'V-20': (
        re_compile(
            '(^IM|(?<=[BCDFGHJKLMNPQRSTVWXZ])'
            + 'IM(?=[BCDFGHJKLMPQRSTVWXZ]))'
        ),
        'IN',
    ),
    # Consonants and groups of consonants
    'C-1': ('BV', 'V'),
    'C-2': (re_compile('(?<=[AEIOUY])C(?=[EIY])'), 'SS'),
    'C-3': (re_compile('(?<=[BDFGHJKLMNPQRSTVWZ])C(?=[EIY])'), 'S'),
    'C-4': (re_compile('^C(?=[EIY])'), 'S'),
    'C-5': (re_compile('^C(?=[OUA])'), 'K'),
    'C-6': (re_compile('(?<=[AEIOUY])C$'), 'K'),
    'C-7': (re_compile('C(?=[BDFGJKLMNPQRSTVWXZ])'), 'K'),
    'C-8': (re_compile('CC(?=[AOU])'), 'K'),
    'C-9': (re_compile('CC(?=[EIY])'), 'X'),
    'C-10': (re_compile('G(?=[EIY])'), 'J'),
    'C-11': (re_compile('GA(?=I?[MN])'), 'G#'),
    'C-12': (re_compile('GE(O|AU)'), 'JO'),
    'C-13': (re_compile('GNI(?=[AEIOUY])'), 'GN'),
    'C-14': (re_compile('(?<![PCS])H'), ''),
    'C-15': ('JEA', 'JA'),
    'C-16': (re_compile('^MAC(?=[BCDFGHJKLMNPQRSTVWXZ])'), 'MA#'),
    'C-17': (re_compile('^MC'), 'MA#'),
    'C-18': ('PH', 'F'),
    'C-19': ('QU', 'K'),
    'C-20': (re_compile('^SC(?=[EIY])'), 'S'),
    'C-21': (re_compile('(?<=.)SC(?=[EIY])'), 'SS'),
    'C-22': (re_compile('(?<=.)SC(?=[AOU])'), 'SK'),
    'C-23': ('SH', 'CH'),
    'C-24': (re_compile('TIA$'), 'SSIA'),
    'C-25': (re_compile('(?<=[AIOUY])W'), ''),
    'C-26': (re_compile('X[CSZ]'), 'X'),
    'C-27': (
        re_compile(
            '(?<=[AEIOUY])Z|(?<=[BCDFGHJKLMNPQRSTVWXZ])'
            + 'Z(?=[BCDFGHJKLMNPQRSTVWXZ])'
        ),
        'S',
    ),
    'C-28': (re_compile(r'([BDFGHJKMNPQRTVWXZ])\1'), r'\1'),
    'C-28a': (re_compile('CC(?=[BCDFGHJKLMNPQRSTVWXZ]|$)'), 'C'),
    'C-28b': (re_compile('((?<=[BCDFGHJKLMNPQRSTVWXZ])|^)SS'), 'S'),
    'C-28bb': (re_compile('SS(?=[BCDFGHJKLMNPQRSTVWXZ]|$)'), 'S'),
    'C-28c': (re_compile('((?<=[^I])|^)LL'), 'L'),
    'C-28d': (re_compile('ILE$'), 'ILLE'),
    'C-29': (
        re_compile(
            '(ILS|[CS]H|[MN]P|R[CFKLNSX])$|([BCDFGHJKL'
            + 'MNPQRSTVWXZ])[BCDFGHJKLMNPQRSTVWXZ]$'
        ),
        # Keep whichever alternative matched: the listed endings are left
        # intact, any other final consonant pair loses its last letter.
        lambda m: (m.group(1) or '') + (m.group(2) or ''),
    ),
    'C-30,32': (re_compile('^(SA?INT?|SEI[NM]|CINQ?|ST)(?!E)-?'), 'ST-'),
    'C-31,33': (re_compile('^(SAINTE|STE)-?'), 'STE-'),
    # Rules to undo rule bleeding prevention in C-11, C-16, C-17
    'C-34': ('G#', 'GA'),
    'C-35': ('MA#', 'MAC'),
}

# Order in which the rules are applied.  The doubled-letter clean-up rules
# (V-14 and C-28 through C-28d) run both at the start and again near the end.
_FONEM_RULE_ORDER = (
    'V-14',
    'C-28',
    'C-28a',
    'C-28b',
    'C-28bb',
    'C-28c',
    'C-28d',
    'C-12',
    'C-8',
    'C-9',
    'C-10',
    'C-16',
    'C-17',
    'C-2',
    'C-3',
    'C-7',
    'V-2,5',
    'V-3,4',
    'V-6',
    'V-1',
    'C-14',
    'C-31,33',
    'C-30,32',
    'C-11',
    'V-15',
    'V-17',
    'V-18',
    'V-7',
    'V-8',
    'V-9',
    'V-10',
    'V-11',
    'V-12',
    'V-13',
    'V-16',
    'V-19',
    'V-20',
    'C-1',
    'C-4',
    'C-5',
    'C-6',
    'C-13',
    'C-15',
    'C-18',
    'C-19',
    'C-20',
    'C-21',
    'C-22',
    'C-23',
    'C-24',
    'C-25',
    'C-26',
    'C-27',
    'C-29',
    'V-14',
    'C-28',
    'C-28a',
    'C-28b',
    'C-28bb',
    'C-28c',
    'C-28d',
    'C-34',
    'C-35',
)


def fonem(word):
    """Return the FONEM code of a word.

    FONEM is a phonetic algorithm designed for French (particularly surnames in
    Saguenay, Canada), defined in :cite:`Bouchard:1981`.

    Guillaume Plique's Javascript implementation :cite:`Plique:2018` at
    https://github.com/Yomguithereal/talisman/blob/master/src/phonetics/french/fonem.js
    was also consulted for this implementation.

    The word is upper-cased, NFKD-normalized, stripped of everything except
    the letters A-Z and the hyphen, and then rewritten by applying each rule
    named in ``_FONEM_RULE_ORDER`` in turn.

    :param str word: the word to transform
    :returns: the FONEM code
    :rtype: str

    >>> fonem('Marchand')
    'MARCHEN'
    >>> fonem('Beaulieu')
    'BOLIEU'
    >>> fonem('Beaumont')
    'BOMON'
    >>> fonem('Legrand')
    'LEGREN'
    >>> fonem('Pelletier')
    'PELETIER'
    """
    # normalize, upper-case, and filter non-French letters
    word = unicode_normalize('NFKD', text_type(word.upper()))
    # Code points 198 and 338 are the AE and OE ligatures; spell them out.
    word = word.translate({198: 'AE', 338: 'OE'})
    word = ''.join(c for c in word if c in _FONEM_LETTERS)

    for rule in _FONEM_RULE_ORDER:
        pattern, repl = _FONEM_RULE_TABLE[rule]
        if isinstance(pattern, text_type):
            # Plain-string rule: literal substring replacement.
            word = word.replace(pattern, repl)
        else:
            word = pattern.sub(repl, word)

    return word
256
257
258 1
# Letter classes and lookup tables used by henry_early().  They are built
# once at import time rather than on every call / loop iteration.
_HENRY_VOWELS = frozenset('AEIOUY')
_HENRY_CONSONANTS = frozenset('BCDFGHJKLMNPQRSTVWXZ')
_HENRY_LETTERS = _HENRY_VOWELS | _HENRY_CONSONANTS
_HENRY_CONS_NOT_MN = _HENRY_CONSONANTS - frozenset('MN')
_HENRY_CONS_NOT_LR = _HENRY_CONSONANTS - frozenset('LR')
_HENRY_DIPHTHONGS = {
    'AI': 'E',
    'AY': 'E',
    'EI': 'E',
    'AU': 'O',
    'OI': 'O',
    'OU': 'O',
    'EU': 'U',
}
# Consonants that map unconditionally to another letter (rule IIb).
_HENRY_SIMPLE = {'W': 'V', 'X': 'S', 'Z': 'S'}
# Following letters that select the "hard" / "soft" reading of C and G.
_HENRY_HARD_NEXT = frozenset('AOULR')
_HENRY_SOFT_NEXT = frozenset('EIY')


def _henry_initial_vowel(word):
    """Apply rule Ib: rewrite the start of a non-empty, vowel-initial word."""
    first = word[0]
    if first not in _HENRY_VOWELS:
        return word

    second = word[1:2]
    third_is_cons = word[2:3] in _HENRY_CONSONANTS

    # Ib1
    if (second in _HENRY_CONS_NOT_MN and third_is_cons) or (
        second in _HENRY_CONSONANTS and not third_is_cons
    ):
        if first == 'Y':
            return 'I' + word[1:]
        return word
    # Ib2
    if second in {'M', 'N'} and third_is_cons:
        if first == 'E':
            return 'A' + word[1:]
        if first in {'I', 'U', 'Y'}:
            return 'E' + word[1:]
        return word
    # Ib3
    if word[:2] in _HENRY_DIPHTHONGS:
        return _HENRY_DIPHTHONGS[word[:2]] + word[2:]
    # Ib4
    if second in _HENRY_VOWELS and first == 'Y':
        return 'I' + word[1:]
    return word


def _henry_code_cgpq(word, pos, char, nxch):
    """Return the code (possibly empty) for a C, G, P or Q at word[pos]."""
    if char == 'C':
        if nxch in _HENRY_HARD_NEXT:
            return 'K'
        if nxch in _HENRY_SOFT_NEXT:
            return 'S'
        if nxch == 'H':
            if word[pos + 2 : pos + 3] in _HENRY_VOWELS:
                return 'C'
            return 'K'  # CHR, CHL, etc.
        return 'C'
    if char == 'G':
        if nxch in _HENRY_HARD_NEXT:
            return 'G'
        if nxch in _HENRY_SOFT_NEXT:
            return 'J'
        if nxch == 'N':
            return 'N'
        # Any other context: G contributes nothing to the code.
        return ''
    if char == 'P':
        return 'F' if nxch == 'H' else 'P'
    # Q
    if word[pos + 1 : pos + 3] in {'UE', 'UI', 'UY'}:
        return 'G'
    return 'K'  # QUA, QUO, etc.


def _henry_code_s(word, pos, nxch):
    """Return (code, letters_to_skip) for an S at word[pos]."""
    if word[pos : pos + 6] == 'SAINTE':
        return 'X', 5
    if word[pos : pos + 5] == 'SAINT':
        return 'X', 4
    if word[pos : pos + 3] == 'STE':
        return 'X', 2
    if word[pos : pos + 2] == 'ST':
        return 'X', 1
    if nxch in _HENRY_CONSONANTS:
        return '', 0
    return 'S', 0


def _henry_encode(word):
    """Apply rule II: translate the word letter by letter into a raw code."""
    pieces = []
    skip = 0

    for pos, char in enumerate(word):
        nxch = word[pos + 1 : pos + 2]
        prev = word[pos - 1 : pos]  # empty at pos 0

        if skip:
            # Letter already consumed by an earlier multi-letter rule.
            skip -= 1
        elif char in _HENRY_VOWELS:
            pieces.append(char)
        # IIc: doubled consonant is emitted once
        elif char == nxch:
            skip = 1
            pieces.append(char)
        elif word[pos : pos + 2] in {'CQ', 'DT', 'SC'}:
            pass  # first letter of these pairs is dropped
        # IIb
        elif char in _HENRY_SIMPLE:
            pieces.append(_HENRY_SIMPLE[char])
        elif char in {'C', 'G', 'P', 'Q'}:
            pieces.append(_henry_code_cgpq(word, pos, char, nxch))
        elif char == 'S':
            emitted, skip = _henry_code_s(word, pos, nxch)
            pieces.append(emitted)
        # IId
        elif char == 'H' and prev in _HENRY_CONSONANTS:
            pass
        elif char in _HENRY_CONS_NOT_LR and nxch in _HENRY_CONS_NOT_LR:
            pass
        elif char == 'L' and nxch in {'M', 'N'}:
            pass
        elif (
            char in {'M', 'N'}
            and prev in _HENRY_VOWELS
            and nxch in _HENRY_CONSONANTS
        ):
            pass
        # IIa
        else:
            pieces.append(char)

    return ''.join(pieces)


def _henry_trim_ending(code):
    """Apply rule IIe: shorten certain word endings in the raw code."""
    # IIe1
    if code[-4:] in {'AULT', 'EULT', 'OULT'}:
        return code[:-2]
    # The vowel+MPS and vowel+MB/MP/ND/NS/NT endings are not handled here:
    # the original implementation notes they are blocked by the rules above.
    if code[-2:-1] == 'R' and code[-1:] in _HENRY_CONSONANTS:
        return code[:-1]
    # IIe2
    if code[-2:-1] in _HENRY_VOWELS and code[-1:] in {
        'D',
        'M',
        'N',
        'S',
        'T',
    }:
        return code[:-1]
    if code[-2:] == 'ER':
        return code[:-1]
    return code


def henry_early(word, max_length=3):
    """Calculate the early version of the Henry code for a word.

    The early version of Henry coding is given in :cite:`Legare:1972`. This is
    different from the later version defined in :cite:`Henry:1976`.

    The word is upper-cased, NFKD-normalized and reduced to the letters A-Z.
    Rule Ib then adjusts a leading vowel, rule II encodes the word letter by
    letter, rule IIe trims certain endings, and finally every vowel after
    the first character of the code is dropped.

    :param str word: the word to transform
    :param int max_length: the length of the code returned (defaults to 3);
        pass -1 to return the code untruncated
    :returns: the early Henry code (an empty string if the word contains no
        letters A-Z after normalization)
    :rtype: str

    >>> henry_early('Marchand')
    'MRC'
    >>> henry_early('Beaulieu')
    'BL'
    >>> henry_early('Beaumont')
    'BM'
    >>> henry_early('Legrand')
    'LGR'
    >>> henry_early('Pelletier')
    'PLT'
    """
    word = unicode_normalize('NFKD', text_type(word.upper()))
    word = ''.join(c for c in word if c in _HENRY_LETTERS)

    if not word:
        return ''

    # Rule Ia seems to be covered entirely in II

    word = _henry_initial_vowel(word)
    code = _henry_trim_ending(_henry_encode(word))

    # Drop non-initial vowels
    code = code[:1] + ''.join(
        c for c in code[1:] if c not in _HENRY_VOWELS
    )

    if max_length != -1:
        code = code[:max_length]

    return code
483
484
485
# When executed as a script, run the doctest examples embedded in this
# module's docstrings.
if __name__ == '__main__':
    import doctest

    doctest.testmod()
489