abydos.phonetic.fr.fonem() - Code Metrics - Inspection of "Modularize" - chrislit/abydos - Measure and Improve Code Quality continuously with Scrutinizer

Completed

Pull Request — master (#120)

by Chris

created 2018-10-19 22:19 UTC

abydos.phonetic.fr.fonem() B

↳ Parent: abydos.phonetic.fr

Complexity

Conditions

Size

Total Lines	128
Code Lines	92

Duplication

Lines	0
Ratio	0 %

Importance

Changes

Metric	Value
cc	4
eloc	92
nop	1
dl	0
loc	128
rs	7.2618
c	0
b	0
f	0

How to fix Long Method

# -*- coding: utf-8 -*-

# Copyright 2018 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.

"""abydos.phonetic.fr.

The phonetic.fr module implements phonetic algorithms intended for French,
including:

    - FONEM
    - an early version of Henry Code
"""

from __future__ import unicode_literals

from re import compile as re_compile
from unicodedata import normalize as unicode_normalize

from six import text_type

__all__ = ['fonem', 'henry_early']


# FONEM rewrite rules, keyed by the rule labels used in the table below.
# Each value is a ``(pattern, replacement)`` pair: the pattern is either a
# plain string (applied with ``str.replace``) or a compiled regular expression
# (applied with ``.sub``); the replacement is a string or, for C-29, a callable.
# The table is built once at import time rather than on every call.
_FONEM_RULE_TABLE = {
    # Vowels & groups of vowels
    'V-1':     (re_compile('E?AU'), 'O'),
    'V-2,5':   (re_compile('(E?AU|O)L[TX]$'), 'O'),
    'V-3,4':   (re_compile('E?AU[TX]$'), 'O'),
    'V-6':     (re_compile('E?AUL?D$'), 'O'),
    'V-7':     (re_compile(r'(?<!G)AY$'), 'E'),
    'V-8':     (re_compile('EUX$'), 'EU'),
    'V-9':     (re_compile('EY(?=$|[BCDFGHJKLMNPQRSTVWXZ])'), 'E'),
    'V-10':    ('Y', 'I'),
    'V-11':    (re_compile('(?<=[AEIOUY])I(?=[AEIOUY])'), 'Y'),
    'V-12':    (re_compile('(?<=[AEIOUY])ILL'), 'Y'),
    'V-13':    (re_compile('OU(?=[AEOU]|I(?!LL))'), 'W'),
    'V-14':    (re_compile(r'([AEIOUY])(?=\1)'), ''),
    # Nasal vowels
    'V-15':    (re_compile('[AE]M(?=[BCDFGHJKLMPQRSTVWXZ])(?!$)'), 'EN'),
    'V-16':    (re_compile('OM(?=[BCDFGHJKLMPQRSTVWXZ])'), 'ON'),
    'V-17':    (re_compile('AN(?=[BCDFGHJKLMNPQRSTVWXZ])'), 'EN'),
    'V-18':    (re_compile('(AI[MN]|EIN)(?=[BCDFGHJKLMNPQRSTVWXZ]|$)'),
                'IN'),
    'V-19':    (re_compile('B(O|U|OU)RNE?$'), 'BURN'),
    'V-20':    (re_compile('(^IM|(?<=[BCDFGHJKLMNPQRSTVWXZ])' +
                           'IM(?=[BCDFGHJKLMPQRSTVWXZ]))'), 'IN'),
    # Consonants and groups of consonants
    'C-1':     ('BV', 'V'),
    'C-2':     (re_compile('(?<=[AEIOUY])C(?=[EIY])'), 'SS'),
    'C-3':     (re_compile('(?<=[BDFGHJKLMNPQRSTVWZ])C(?=[EIY])'), 'S'),
    'C-4':     (re_compile('^C(?=[EIY])'), 'S'),
    'C-5':     (re_compile('^C(?=[OUA])'), 'K'),
    'C-6':     (re_compile('(?<=[AEIOUY])C$'), 'K'),
    'C-7':     (re_compile('C(?=[BDFGJKLMNPQRSTVWXZ])'), 'K'),
    'C-8':     (re_compile('CC(?=[AOU])'), 'K'),
    'C-9':     (re_compile('CC(?=[EIY])'), 'X'),
    'C-10':    (re_compile('G(?=[EIY])'), 'J'),
    'C-11':    (re_compile('GA(?=I?[MN])'), 'G#'),
    'C-12':    (re_compile('GE(O|AU)'), 'JO'),
    'C-13':    (re_compile('GNI(?=[AEIOUY])'), 'GN'),
    'C-14':    (re_compile('(?<![PCS])H'), ''),
    'C-15':    ('JEA', 'JA'),
    'C-16':    (re_compile('^MAC(?=[BCDFGHJKLMNPQRSTVWXZ])'), 'MA#'),
    'C-17':    (re_compile('^MC'), 'MA#'),
    'C-18':    ('PH', 'F'),
    'C-19':    ('QU', 'K'),
    'C-20':    (re_compile('^SC(?=[EIY])'), 'S'),
    'C-21':    (re_compile('(?<=.)SC(?=[EIY])'), 'SS'),
    'C-22':    (re_compile('(?<=.)SC(?=[AOU])'), 'SK'),
    'C-23':    ('SH', 'CH'),
    'C-24':    (re_compile('TIA$'), 'SSIA'),
    'C-25':    (re_compile('(?<=[AIOUY])W'), ''),
    'C-26':    (re_compile('X[CSZ]'), 'X'),
    'C-27':    (re_compile('(?<=[AEIOUY])Z|(?<=[BCDFGHJKLMNPQRSTVWXZ])' +
                           'Z(?=[BCDFGHJKLMNPQRSTVWXZ])'), 'S'),
    'C-28':    (re_compile(r'([BDFGHJKMNPQRTVWXZ])\1'), r'\1'),
    'C-28a':   (re_compile('CC(?=[BCDFGHJKLMNPQRSTVWXZ]|$)'), 'C'),
    'C-28b':   (re_compile('((?<=[BCDFGHJKLMNPQRSTVWXZ])|^)SS'), 'S'),
    'C-28bb':  (re_compile('SS(?=[BCDFGHJKLMNPQRSTVWXZ]|$)'), 'S'),
    'C-28c':   (re_compile('((?<=[^I])|^)LL'), 'L'),
    'C-28d':   (re_compile('ILE$'), 'ILLE'),
    # C-29 keeps whichever capture group matched: the whole listed final
    # cluster (group 1) or only the first of two final consonants (group 2).
    'C-29':    (re_compile('(ILS|[CS]H|[MN]P|R[CFKLNSX])$|([BCDFGHJKL' +
                           'MNPQRSTVWXZ])[BCDFGHJKLMNPQRSTVWXZ]$'),
                lambda m: (m.group(1) or '') + (m.group(2) or '')),
    'C-30,32': (re_compile('^(SA?INT?|SEI[NM]|CINQ?|ST)(?!E)-?'), 'ST-'),
    'C-31,33': (re_compile('^(SAINTE|STE)-?'), 'STE-'),
    # Rules to undo rule bleeding prevention in C-11, C-16, C-17
    'C-34':    ('G#', 'GA'),
    'C-35':    ('MA#', 'MAC')
}

# Order in which the rules are applied. The degemination rules (V-14 and the
# C-28 family) appear twice: once at the start and once more after C-29.
_FONEM_RULE_ORDER = (
    'V-14', 'C-28', 'C-28a', 'C-28b', 'C-28bb', 'C-28c', 'C-28d',
    'C-12',
    'C-8', 'C-9', 'C-10',
    'C-16', 'C-17', 'C-2', 'C-3', 'C-7',
    'V-2,5', 'V-3,4', 'V-6',
    'V-1', 'C-14',
    'C-31,33', 'C-30,32',
    'C-11', 'V-15', 'V-17', 'V-18',
    'V-7', 'V-8', 'V-9', 'V-10', 'V-11', 'V-12', 'V-13', 'V-16',
    'V-19', 'V-20',
    'C-1', 'C-4', 'C-5', 'C-6', 'C-13', 'C-15',
    'C-18', 'C-19', 'C-20', 'C-21', 'C-22', 'C-23', 'C-24',
    'C-25', 'C-26', 'C-27',
    'C-29',
    'V-14', 'C-28', 'C-28a', 'C-28b', 'C-28bb', 'C-28c', 'C-28d',
    'C-34', 'C-35'
)

# The rules resolved into application order, so fonem() only has to iterate.
_FONEM_RULES = tuple(_FONEM_RULE_TABLE[_name] for _name in _FONEM_RULE_ORDER)

# Characters that survive input filtering: A-Z and the hyphen.
_FONEM_LETTERS = frozenset('ABCDEFGHIJKLMNOPQRSTUVWXYZ-')


def fonem(word):
    """Return the FONEM code of a word.

    FONEM is a phonetic algorithm designed for French (particularly surnames in
    Saguenay, Canada), defined in :cite:`Bouchard:1981`.

    Guillaume Plique's Javascript implementation :cite:`Plique:2018` at
    https://github.com/Yomguithereal/talisman/blob/master/src/phonetics/french/fonem.js
    was also consulted for this implementation.

    The word is upper-cased, NFKD-normalized and stripped of everything but
    A-Z and the hyphen; the rewrite rules in ``_FONEM_RULES`` are then applied
    one after another, each to the output of the previous one.

    :param str word: the word to transform
    :returns: the FONEM code
    :rtype: str

    >>> fonem('Marchand')
    'MARCHEN'
    >>> fonem('Beaulieu')
    'BOLIEU'
    >>> fonem('Beaumont')
    'BOMON'
    >>> fonem('Legrand')
    'LEGREN'
    >>> fonem('Pelletier')
    'PELETIER'
    """
    # normalize, upper-case, and filter non-French letters
    word = unicode_normalize('NFKD', text_type(word.upper()))
    # NFKD leaves the ligatures U+00C6 (198) and U+0152 (338) intact, so
    # expand them by hand before filtering.
    word = word.translate({198: 'AE', 338: 'OE'})
    word = ''.join(c for c in word if c in _FONEM_LETTERS)

    for pattern, repl in _FONEM_RULES:
        if isinstance(pattern, text_type):
            # literal rule: plain substring replacement
            word = word.replace(pattern, repl)
        else:
            # compiled regular expression rule
            word = pattern.sub(repl, word)

    return word


def henry_early(word, max_length=3):
    """Calculate the early version of the Henry code for a word.

    The early version of Henry coding is given in :cite:`Legare:1972`. This is
    different from the later version defined in :cite:`Henry:1976`.

    The word is upper-cased, NFKD-normalized and reduced to the letters A-Z.
    An initial vowel is adjusted (rule Ib), the word is scanned letter by
    letter (rule II), the ending is trimmed (rule IIe), and finally every
    vowel except a leading one is dropped.

    :param str word: the word to transform
    :param int max_length: the length of the code returned (defaults to 3);
        pass -1 to return the code without truncation
    :returns: the early Henry code (the empty string if no letters remain
        after filtering)
    :rtype: str

    >>> henry_early('Marchand')
    'MRC'
    >>> henry_early('Beaulieu')
    'BL'
    >>> henry_early('Beaumont')
    'BM'
    >>> henry_early('Legrand')
    'LGR'
    >>> henry_early('Pelletier')
    'PLT'
    """
    _cons = {'B', 'C', 'D', 'F', 'G', 'H', 'J', 'K', 'L', 'M', 'N', 'P', 'Q',
             'R', 'S', 'T', 'V', 'W', 'X', 'Z'}
    _vows = {'A', 'E', 'I', 'O', 'U', 'Y'}
    _diph = {'AI': 'E', 'AY': 'E', 'EI': 'E', 'AU': 'O', 'OI': 'O', 'OU': 'O',
             'EU': 'U'}
    # Letters not listed in _simple or handled specially below
    # (B, D, F, J, K, L, M, N, R, T, V) fall through to rule IIa unchanged.
    _simple = {'W': 'V', 'X': 'S', 'Z': 'S'}

    # Derived sets, computed once instead of on every loop iteration.
    _nasals = {'M', 'N'}
    _cons_not_nasal = _cons - _nasals
    _cons_not_liquid = _cons - {'L', 'R'}
    _hard_context = {'A', 'O', 'U', 'L', 'R'}
    _soft_context = {'E', 'I', 'Y'}

    word = unicode_normalize('NFKD', text_type(word.upper()))
    word = ''.join(c for c in word if c in
                   {'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L',
                    'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X',
                    'Y', 'Z'})

    if not word:
        return ''

    # Rule Ia seems to be covered entirely in II

    # Rule Ib: adjust an initial vowel. The slices word[1:2] / word[2:3] are
    # empty strings (never members of the sets) when the word is too short.
    if word[0] in _vows:
        second = word[1:2]
        third = word[2:3]
        # Ib1
        if ((second in _cons_not_nasal and third in _cons) or
                (second in _cons and third not in _cons)):
            if word[0] == 'Y':
                word = 'I' + word[1:]
        # Ib2
        elif second in _nasals and third in _cons:
            if word[0] == 'E':
                word = 'A' + word[1:]
            elif word[0] in {'I', 'U', 'Y'}:
                word = 'E' + word[1:]
        # Ib3
        elif word[:2] in _diph:
            word = _diph[word[:2]] + word[2:]
        # Ib4
        elif second in _vows and word[0] == 'Y':
            word = 'I' + word[1:]

    pieces = []  # emitted code fragments, joined after the scan
    skip = 0     # number of upcoming letters to ignore

    # Rule II
    for pos, char in enumerate(word):
        nxch = word[pos+1:pos+2]
        prev = word[pos-1:pos]  # empty at pos 0 (slice [-1:0])

        if skip:
            skip -= 1
        elif char in _vows:
            pieces.append(char)
        # IIc: a doubled consonant is emitted once
        elif char == nxch:
            skip = 1
            pieces.append(char)
        elif word[pos:pos+2] in {'CQ', 'DT', 'SC'}:
            continue
        # IIb
        elif char in _simple:
            pieces.append(_simple[char])
        elif char in {'C', 'G', 'P', 'Q', 'S'}:
            if char == 'C':
                if nxch in _hard_context:
                    pieces.append('K')
                elif nxch in _soft_context:
                    pieces.append('S')
                elif nxch == 'H':
                    if word[pos+2:pos+3] in _vows:
                        pieces.append('C')
                    else:  # CHR, CHL, etc.
                        pieces.append('K')
                else:
                    pieces.append('C')
            elif char == 'G':
                if nxch in _hard_context:
                    pieces.append('G')
                elif nxch in _soft_context:
                    pieces.append('J')
                elif nxch == 'N':
                    pieces.append('N')
                # in any other context G contributes nothing
            elif char == 'P':
                if nxch != 'H':
                    pieces.append('P')
                else:
                    pieces.append('F')
            elif char == 'Q':
                if word[pos+1:pos+3] in {'UE', 'UI', 'UY'}:
                    pieces.append('G')
                else:  # QUA, QUO, etc.
                    pieces.append('K')
            else:  # S...
                # SAINTE / SAINT / STE / ST collapse to X; skip the rest of
                # the matched prefix.
                if word[pos:pos+6] == 'SAINTE':
                    pieces.append('X')
                    skip = 5
                elif word[pos:pos+5] == 'SAINT':
                    pieces.append('X')
                    skip = 4
                elif word[pos:pos+3] == 'STE':
                    pieces.append('X')
                    skip = 2
                elif word[pos:pos+2] == 'ST':
                    pieces.append('X')
                    skip = 1
                elif nxch in _cons:
                    continue
                else:
                    pieces.append('S')
        # IId
        elif char == 'H' and prev in _cons:
            continue
        elif char in _cons_not_liquid and nxch in _cons_not_liquid:
            continue
        elif char == 'L' and nxch in _nasals:
            continue
        elif char in _nasals and prev in _vows and nxch in _cons:
            continue
        # IIa
        else:
            pieces.append(char)

    code = ''.join(pieces)

    # IIe1
    if code[-4:] in {'AULT', 'EULT', 'OULT'}:
        code = code[:-2]
    # NOTE: the vowel+MPS and vowel+MB/MP/ND/NS/NT ending rules are not
    # implemented here; the original author noted they are blocked by the
    # rules above.
    elif code[-2:-1] == 'R' and code[-1:] in _cons:
        code = code[:-1]
    # IIe2
    elif code[-2:-1] in _vows and code[-1:] in {'D', 'M', 'N', 'S', 'T'}:
        code = code[:-1]
    elif code[-2:] == 'ER':
        code = code[:-1]

    # Drop non-initial vowels
    code = code[:1] + ''.join(c for c in code[1:] if c not in _vows)

    if max_length != -1:
        code = code[:max_length]

    return code


if __name__ == '__main__':
    # When executed as a script, run the doctests embedded in the docstrings
    # of this module.
    from doctest import testmod
    testmod()


1			# -*- coding: utf-8 -*-
2
3			# Copyright 2018 by Christopher C. Little.
4			# This file is part of Abydos.
5			#
6			# Abydos is free software: you can redistribute it and/or modify
7			# it under the terms of the GNU General Public License as published by
8			# the Free Software Foundation, either version 3 of the License, or
9			# (at your option) any later version.
10			#
11			# Abydos is distributed in the hope that it will be useful,
12			# but WITHOUT ANY WARRANTY; without even the implied warranty of
13			# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14			# GNU General Public License for more details.
15			#
16			# You should have received a copy of the GNU General Public License
17			# along with Abydos. If not, see <http://www.gnu.org/licenses/>.
18
19			"""abydos.phonetic.fr.
20
21			The phonetic.fr module implements phonetic algorithms intended for French,
22			including:
23
24			- FONEM
25			- an early version of Henry Code
26			"""
27
28			from __future__ import unicode_literals
29
30			from re import compile as re_compile
31			from unicodedata import normalize as unicode_normalize
32
33			from six import text_type
34
35			__all__ = ['fonem', 'henry_early']
36
37
38			def fonem(word):
39			"""Return the FONEM code of a word.
40
41			FONEM is a phonetic algorithm designed for French (particularly surnames in
42			Saguenay, Canada), defined in :cite:`Bouchard:1981`.
43
44			Guillaume Plique's Javascript implementation :cite:`Plique:2018` at
45			https://github.com/Yomguithereal/talisman/blob/master/src/phonetics/french/fonem.js
46			was also consulted for this implementation.
47
48			:param str word: the word to transform
49			:returns: the FONEM code
50			:rtype: str
51
52			>>> fonem('Marchand')
53			'MARCHEN'
54			>>> fonem('Beaulieu')
55			'BOLIEU'
56			>>> fonem('Beaumont')
57			'BOMON'
58			>>> fonem('Legrand')
59			'LEGREN'
60			>>> fonem('Pelletier')
61			'PELETIER'
62			"""
63			# I don't see a sane way of doing this without regexps :(
64			rule_table = {
65			# Vowels & groups of vowels
66			'V-1': (re_compile('E?AU'), 'O'),
67			'V-2,5': (re_compile('(E?AU\|O)L[TX]$'), 'O'),
68			'V-3,4': (re_compile('E?AU[TX]$'), 'O'),
69			'V-6': (re_compile('E?AUL?D$'), 'O'),
70			'V-7': (re_compile(r'(?<!G)AY$'), 'E'),
71			'V-8': (re_compile('EUX$'), 'EU'),
72			'V-9': (re_compile('EY(?=$\|[BCDFGHJKLMNPQRSTVWXZ])'), 'E'),
73			'V-10': ('Y', 'I'),
74			'V-11': (re_compile('(?<=[AEIOUY])I(?=[AEIOUY])'), 'Y'),
75			'V-12': (re_compile('(?<=[AEIOUY])ILL'), 'Y'),
76			'V-13': (re_compile('OU(?=[AEOU]\|I(?!LL))'), 'W'),
77			'V-14': (re_compile(r'([AEIOUY])(?=\1)'), ''),
78			# Nasal vowels
79			'V-15': (re_compile('[AE]M(?=[BCDFGHJKLMPQRSTVWXZ])(?!$)'), 'EN'),
80			'V-16': (re_compile('OM(?=[BCDFGHJKLMPQRSTVWXZ])'), 'ON'),
81			'V-17': (re_compile('AN(?=[BCDFGHJKLMNPQRSTVWXZ])'), 'EN'),
82			'V-18': (re_compile('(AI[MN]\|EIN)(?=[BCDFGHJKLMNPQRSTVWXZ]\|$)'),
83			'IN'),
84			'V-19': (re_compile('B(O\|U\|OU)RNE?$'), 'BURN'),
85			'V-20': (re_compile('(^IM\|(?<=[BCDFGHJKLMNPQRSTVWXZ])' +
86			'IM(?=[BCDFGHJKLMPQRSTVWXZ]))'), 'IN'),
87			# Consonants and groups of consonants
88			'C-1': ('BV', 'V'),
89			'C-2': (re_compile('(?<=[AEIOUY])C(?=[EIY])'), 'SS'),
90			'C-3': (re_compile('(?<=[BDFGHJKLMNPQRSTVWZ])C(?=[EIY])'), 'S'),
91			'C-4': (re_compile('^C(?=[EIY])'), 'S'),
92			'C-5': (re_compile('^C(?=[OUA])'), 'K'),
93			'C-6': (re_compile('(?<=[AEIOUY])C$'), 'K'),
94			'C-7': (re_compile('C(?=[BDFGJKLMNPQRSTVWXZ])'), 'K'),
95			'C-8': (re_compile('CC(?=[AOU])'), 'K'),
96			'C-9': (re_compile('CC(?=[EIY])'), 'X'),
97			'C-10': (re_compile('G(?=[EIY])'), 'J'),
98			'C-11': (re_compile('GA(?=I?[MN])'), 'G#'),
99			'C-12': (re_compile('GE(O\|AU)'), 'JO'),
100			'C-13': (re_compile('GNI(?=[AEIOUY])'), 'GN'),
101			'C-14': (re_compile('(?<![PCS])H'), ''),
102			'C-15': ('JEA', 'JA'),
103			'C-16': (re_compile('^MAC(?=[BCDFGHJKLMNPQRSTVWXZ])'), 'MA#'),
104			'C-17': (re_compile('^MC'), 'MA#'),
105			'C-18': ('PH', 'F'),
106			'C-19': ('QU', 'K'),
107			'C-20': (re_compile('^SC(?=[EIY])'), 'S'),
108			'C-21': (re_compile('(?<=.)SC(?=[EIY])'), 'SS'),
109			'C-22': (re_compile('(?<=.)SC(?=[AOU])'), 'SK'),
110			'C-23': ('SH', 'CH'),
111			'C-24': (re_compile('TIA$'), 'SSIA'),
112			'C-25': (re_compile('(?<=[AIOUY])W'), ''),
113			'C-26': (re_compile('X[CSZ]'), 'X'),
114			'C-27': (re_compile('(?<=[AEIOUY])Z\|(?<=[BCDFGHJKLMNPQRSTVWXZ])' +
115			'Z(?=[BCDFGHJKLMNPQRSTVWXZ])'), 'S'),
116			'C-28': (re_compile(r'([BDFGHJKMNPQRTVWXZ])\1'), r'\1'),
117			'C-28a': (re_compile('CC(?=[BCDFGHJKLMNPQRSTVWXZ]\|$)'), 'C'),
118			'C-28b': (re_compile('((?<=[BCDFGHJKLMNPQRSTVWXZ])\|^)SS'), 'S'),
119			'C-28bb': (re_compile('SS(?=[BCDFGHJKLMNPQRSTVWXZ]\|$)'), 'S'),
120			'C-28c': (re_compile('((?<=[^I])\|^)LL'), 'L'),
121			'C-28d': (re_compile('ILE$'), 'ILLE'),
122			'C-29': (re_compile('(ILS\|[CS]H\|[MN]P\|R[CFKLNSX])$\|([BCDFGHJKL' +
123			'MNPQRSTVWXZ])[BCDFGHJKLMNPQRSTVWXZ]$'),
124			lambda m: (m.group(1) or '') + (m.group(2) or '')),
125			'C-30,32': (re_compile('^(SA?INT?\|SEI[NM]\|CINQ?\|ST)(?!E)-?'), 'ST-'),
126			'C-31,33': (re_compile('^(SAINTE\|STE)-?'), 'STE-'),
127			# Rules to undo rule bleeding prevention in C-11, C-16, C-17
128			'C-34': ('G#', 'GA'),
129			'C-35': ('MA#', 'MAC')
130			}
131			rule_order = [
132			'V-14', 'C-28', 'C-28a', 'C-28b', 'C-28bb', 'C-28c', 'C-28d',
133			'C-12',
134			'C-8', 'C-9', 'C-10',
135			'C-16', 'C-17', 'C-2', 'C-3', 'C-7',
136			'V-2,5', 'V-3,4', 'V-6',
137			'V-1', 'C-14',
138			'C-31,33', 'C-30,32',
139			'C-11', 'V-15', 'V-17', 'V-18',
140			'V-7', 'V-8', 'V-9', 'V-10', 'V-11', 'V-12', 'V-13', 'V-16',
141			'V-19', 'V-20',
142			'C-1', 'C-4', 'C-5', 'C-6', 'C-13', 'C-15',
143			'C-18', 'C-19', 'C-20', 'C-21', 'C-22', 'C-23', 'C-24',
144			'C-25', 'C-26', 'C-27',
145			'C-29',
146			'V-14', 'C-28', 'C-28a', 'C-28b', 'C-28bb', 'C-28c', 'C-28d',
147			'C-34', 'C-35'
148			]
149
150			# normalize, upper-case, and filter non-French letters
151			word = unicode_normalize('NFKD', text_type(word.upper()))
152			word = word.translate({198: 'AE', 338: 'OE'})
153			word = ''.join(c for c in word if c in
154			{'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L',
155			'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X',
156			'Y', 'Z', '-'})
157
158			for rule in rule_order:
159			regex, repl = rule_table[rule]
160			if isinstance(regex, text_type):
161			word = word.replace(regex, repl)
162			else:
163			word = regex.sub(repl, word)
164
165			return word
166
167
168			def henry_early(word, max_length=3):
169			"""Calculate the early version of the Henry code for a word.
170
171			The early version of Henry coding is given in :cite:`Legare:1972`. This is
172			different from the later version defined in :cite:`Henry:1976`.
173
174			:param str word: the word to transform
175			:param int max_length: the length of the code returned (defaults to 3)
176			:returns: the early Henry code
177			:rtype: str
178
179			>>> henry_early('Marchand')
180			'MRC'
181			>>> henry_early('Beaulieu')
182			'BL'
183			>>> henry_early('Beaumont')
184			'BM'
185			>>> henry_early('Legrand')
186			'LGR'
187			>>> henry_early('Pelletier')
188			'PLT'
189			"""
190			_cons = {'B', 'C', 'D', 'F', 'G', 'H', 'J', 'K', 'L', 'M', 'N', 'P', 'Q',
191			'R', 'S', 'T', 'V', 'W', 'X', 'Z'}
192			_vows = {'A', 'E', 'I', 'O', 'U', 'Y'}
193			_diph = {'AI': 'E', 'AY': 'E', 'EI': 'E', 'AU': 'O', 'OI': 'O', 'OU': 'O',
194			'EU': 'U'}
195			# _unaltered = {'B', 'D', 'F', 'J', 'K', 'L', 'M', 'N', 'R', 'T', 'V'}
196			_simple = {'W': 'V', 'X': 'S', 'Z': 'S'}
197
198			word = unicode_normalize('NFKD', text_type(word.upper()))
199			word = ''.join(c for c in word if c in
200			{'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L',
201			'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X',
202			'Y', 'Z'})
203
204			if not word:
205			return ''
206
207			# Rule Ia seems to be covered entirely in II
208
209			# Rule Ib
210			if word[0] in _vows:
211			# Ib1
212			if (((word[1:2] in _cons-{'M', 'N'} and word[2:3] in _cons) or
213			(word[1:2] in _cons and word[2:3] not in _cons))):
214			if word[0] == 'Y':
215			word = 'I'+word[1:]
216			# Ib2
217			elif word[1:2] in {'M', 'N'} and word[2:3] in _cons:
218			if word[0] == 'E':
219			word = 'A'+word[1:]
220			elif word[0] in {'I', 'U', 'Y'}:
221			word = 'E'+word[1:]
222			# Ib3
223			elif word[:2] in _diph:
224			word = _diph[word[:2]]+word[2:]
225			# Ib4
226			elif word[1:2] in _vows and word[0] == 'Y':
227			word = 'I' + word[1:]
228
229			code = ''
230			skip = 0
231
232			# Rule II
233			for pos, char in enumerate(word):
234			nxch = word[pos+1:pos+2]
235			prev = word[pos-1:pos]
236
237			if skip:
238			skip -= 1
239			elif char in _vows:
240			code += char
241			# IIc
242			elif char == nxch:
243			skip = 1
244			code += char
245			elif word[pos:pos+2] in {'CQ', 'DT', 'SC'}:
246			continue
247			# IIb
248			elif char in _simple:
249			code += _simple[char]
250			elif char in {'C', 'G', 'P', 'Q', 'S'}:
251			if char == 'C':
252			if nxch in {'A', 'O', 'U', 'L', 'R'}:
253			code += 'K'
254			elif nxch in {'E', 'I', 'Y'}:
255			code += 'S'
256			elif nxch == 'H':
257			if word[pos+2:pos+3] in _vows:
258			code += 'C'
259			else: # CHR, CHL, etc.
260			code += 'K'
261			else:
262			code += 'C'
263			elif char == 'G':
264			if nxch in {'A', 'O', 'U', 'L', 'R'}:
265			code += 'G'
266			elif nxch in {'E', 'I', 'Y'}:
267			code += 'J'
268			elif nxch == 'N':
269			code += 'N'
270			elif char == 'P':
271			if nxch != 'H':
272			code += 'P'
273			else:
274			code += 'F'
275			elif char == 'Q':
276			if word[pos+1:pos+3] in {'UE', 'UI', 'UY'}:
277			code += 'G'
278			else: # QUA, QUO, etc.
279			code += 'K'
280			else: # S...
281			if word[pos:pos+6] == 'SAINTE':
282			code += 'X'
283			skip = 5
284			elif word[pos:pos+5] == 'SAINT':
285			code += 'X'
286			skip = 4
287			elif word[pos:pos+3] == 'STE':
288			code += 'X'
289			skip = 2
290			elif word[pos:pos+2] == 'ST':
291			code += 'X'
292			skip = 1
293			elif nxch in _cons:
294			continue
295			else:
296			code += 'S'
297			# IId
298			elif char == 'H' and prev in _cons:
299			continue
300			elif char in _cons-{'L', 'R'} and nxch in _cons-{'L', 'R'}:
301			continue
302			elif char == 'L' and nxch in {'M', 'N'}:
303			continue
304			elif char in {'M', 'N'} and prev in _vows and nxch in _cons:
305			continue
306			# IIa
307			else:
308			code += char
309
310			# IIe1
311			if code[-4:] in {'AULT', 'EULT', 'OULT'}:
312			code = code[:-2]
313			# The following are blocked by rules above
314			# elif code[-4:-3] in _vows and code[-3:] == 'MPS':
315			# code = code[:-3]
316			# elif code[-3:-2] in _vows and code[-2:] in {'MB', 'MP', 'ND',
317			# 'NS', 'NT'}:
318			# code = code[:-2]
319			elif code[-2:-1] == 'R' and code[-1:] in _cons:
320			code = code[:-1]
321			# IIe2
322			elif code[-2:-1] in _vows and code[-1:] in {'D', 'M', 'N', 'S', 'T'}:
323			code = code[:-1]
324			elif code[-2:] == 'ER':
325			code = code[:-1]
326
327			# Drop non-initial vowels
328			code = code[:1]+code[1:].translate({65: '', 69: '', 73: '', 79: '', 85: '',
329			89: ''})
330
331			if max_length != -1:
332			code = code[:max_length]
333
334			return code
335
336
337			if __name__ == '__main__':
338			import doctest
339			doctest.testmod()
340

chrislit / abydos

Pull Request — master (#120)

abydos.phonetic.fr.fonem() B

Complexity

Size

Duplication

Importance

How to fix Long Method

Long Method

Duplication Side-by-Side

Filter issues like