# -*- coding: utf-8 -*-

# Copyright 2018 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.

"""abydos.phonetic._fr.

The phonetic._fr module implements phonetic algorithms intended for French,
including:

    - FONEM
    - an early version of Henry Code
"""

from __future__ import unicode_literals

from re import compile as re_compile
from unicodedata import normalize as unicode_normalize

from six import text_type

__all__ = ['fonem', 'henry_early']


def fonem(word):
    """Return the FONEM code of a word.

    FONEM is a phonetic algorithm designed for French (particularly surnames
    in Saguenay, Canada), defined in :cite:`Bouchard:1981`.

    Guillaume Plique's JavaScript implementation :cite:`Plique:2018` at
    https://github.com/Yomguithereal/talisman/blob/master/src/phonetics/french/fonem.js
    was also consulted for this implementation.

    :param str word: the word to transform
    :returns: the FONEM code
    :rtype: str

    >>> fonem('Marchand')
    'MARCHEN'
    >>> fonem('Beaulieu')
    'BOLIEU'
    >>> fonem('Beaumont')
    'BOMON'
    >>> fonem('Legrand')
    'LEGREN'
    >>> fonem('Pelletier')
    'PELETIER'
    """
    # I don't see a sane way of doing this without regexps :(
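    # Each entry maps a rule ID from the FONEM description to a
    # (pattern, replacement) pair; plain-string patterns are applied with
    # str.replace and compiled patterns with regex substitution (see the
    # loop below). The '#' in the replacements for C-11, C-16, and C-17 is
    # a temporary marker that C-34 and C-35 later expand back to GA and MAC.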
    rule_table = {
        # Vowels & groups of vowels
        'V-1': (re_compile('E?AU'), 'O'),
        'V-2,5': (re_compile('(E?AU|O)L[TX]$'), 'O'),
        'V-3,4': (re_compile('E?AU[TX]$'), 'O'),
        'V-6': (re_compile('E?AUL?D$'), 'O'),
        'V-7': (re_compile(r'(?<!G)AY$'), 'E'),
        'V-8': (re_compile('EUX$'), 'EU'),
        'V-9': (re_compile('EY(?=$|[BCDFGHJKLMNPQRSTVWXZ])'), 'E'),
        'V-10': ('Y', 'I'),
        'V-11': (re_compile('(?<=[AEIOUY])I(?=[AEIOUY])'), 'Y'),
        'V-12': (re_compile('(?<=[AEIOUY])ILL'), 'Y'),
        'V-13': (re_compile('OU(?=[AEOU]|I(?!LL))'), 'W'),
        'V-14': (re_compile(r'([AEIOUY])(?=\1)'), ''),
        # Nasal vowels
        'V-15': (re_compile('[AE]M(?=[BCDFGHJKLMPQRSTVWXZ])(?!$)'), 'EN'),
        'V-16': (re_compile('OM(?=[BCDFGHJKLMPQRSTVWXZ])'), 'ON'),
        'V-17': (re_compile('AN(?=[BCDFGHJKLMNPQRSTVWXZ])'), 'EN'),
        'V-18': (re_compile('(AI[MN]|EIN)(?=[BCDFGHJKLMNPQRSTVWXZ]|$)'), 'IN'),
        'V-19': (re_compile('B(O|U|OU)RNE?$'), 'BURN'),
        'V-20': (
            re_compile(
                '(^IM|(?<=[BCDFGHJKLMNPQRSTVWXZ])'
                + 'IM(?=[BCDFGHJKLMPQRSTVWXZ]))'
            ),
            'IN',
        ),
        # Consonants and groups of consonants
        'C-1': ('BV', 'V'),
        'C-2': (re_compile('(?<=[AEIOUY])C(?=[EIY])'), 'SS'),
        'C-3': (re_compile('(?<=[BDFGHJKLMNPQRSTVWZ])C(?=[EIY])'), 'S'),
        'C-4': (re_compile('^C(?=[EIY])'), 'S'),
        'C-5': (re_compile('^C(?=[OUA])'), 'K'),
        'C-6': (re_compile('(?<=[AEIOUY])C$'), 'K'),
        'C-7': (re_compile('C(?=[BDFGJKLMNPQRSTVWXZ])'), 'K'),
        'C-8': (re_compile('CC(?=[AOU])'), 'K'),
        'C-9': (re_compile('CC(?=[EIY])'), 'X'),
        'C-10': (re_compile('G(?=[EIY])'), 'J'),
        'C-11': (re_compile('GA(?=I?[MN])'), 'G#'),
        'C-12': (re_compile('GE(O|AU)'), 'JO'),
        'C-13': (re_compile('GNI(?=[AEIOUY])'), 'GN'),
        'C-14': (re_compile('(?<![PCS])H'), ''),
        'C-15': ('JEA', 'JA'),
        'C-16': (re_compile('^MAC(?=[BCDFGHJKLMNPQRSTVWXZ])'), 'MA#'),
        'C-17': (re_compile('^MC'), 'MA#'),
        'C-18': ('PH', 'F'),
        'C-19': ('QU', 'K'),
        'C-20': (re_compile('^SC(?=[EIY])'), 'S'),
        'C-21': (re_compile('(?<=.)SC(?=[EIY])'), 'SS'),
        'C-22': (re_compile('(?<=.)SC(?=[AOU])'), 'SK'),
        'C-23': ('SH', 'CH'),
        'C-24': (re_compile('TIA$'), 'SSIA'),
        'C-25': (re_compile('(?<=[AIOUY])W'), ''),
        'C-26': (re_compile('X[CSZ]'), 'X'),
        'C-27': (
            re_compile(
                '(?<=[AEIOUY])Z|(?<=[BCDFGHJKLMNPQRSTVWXZ])'
                + 'Z(?=[BCDFGHJKLMNPQRSTVWXZ])'
            ),
            'S',
        ),
        'C-28': (re_compile(r'([BDFGHJKMNPQRTVWXZ])\1'), r'\1'),
        'C-28a': (re_compile('CC(?=[BCDFGHJKLMNPQRSTVWXZ]|$)'), 'C'),
        'C-28b': (re_compile('((?<=[BCDFGHJKLMNPQRSTVWXZ])|^)SS'), 'S'),
        'C-28bb': (re_compile('SS(?=[BCDFGHJKLMNPQRSTVWXZ]|$)'), 'S'),
        'C-28c': (re_compile('((?<=[^I])|^)LL'), 'L'),
        'C-28d': (re_compile('ILE$'), 'ILLE'),
        'C-29': (
            re_compile(
                '(ILS|[CS]H|[MN]P|R[CFKLNSX])$|([BCDFGHJKL'
                + 'MNPQRSTVWXZ])[BCDFGHJKLMNPQRSTVWXZ]$'
            ),
            lambda m: (m.group(1) or '') + (m.group(2) or ''),
        ),
        'C-30,32': (re_compile('^(SA?INT?|SEI[NM]|CINQ?|ST)(?!E)-?'), 'ST-'),
        'C-31,33': (re_compile('^(SAINTE|STE)-?'), 'STE-'),
        # Rules to undo rule bleeding prevention in C-11, C-16, C-17
        'C-34': ('G#', 'GA'),
        'C-35': ('MA#', 'MAC'),
    }
    rule_order = [
        'V-14',
        'C-28',
        'C-28a',
        'C-28b',
        'C-28bb',
        'C-28c',
        'C-28d',
        'C-12',
        'C-8',
        'C-9',
        'C-10',
        'C-16',
        'C-17',
        'C-2',
        'C-3',
        'C-7',
        'V-2,5',
        'V-3,4',
        'V-6',
        'V-1',
        'C-14',
        'C-31,33',
        'C-30,32',
        'C-11',
        'V-15',
        'V-17',
        'V-18',
        'V-7',
        'V-8',
        'V-9',
        'V-10',
        'V-11',
        'V-12',
        'V-13',
        'V-16',
        'V-19',
        'V-20',
        'C-1',
        'C-4',
        'C-5',
        'C-6',
        'C-13',
        'C-15',
        'C-18',
        'C-19',
        'C-20',
        'C-21',
        'C-22',
        'C-23',
        'C-24',
        'C-25',
        'C-26',
        'C-27',
        'C-29',
        'V-14',
        'C-28',
        'C-28a',
        'C-28b',
        'C-28bb',
        'C-28c',
        'C-28d',
        'C-34',
        'C-35',
    ]
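    # The cleanup rules (V-14 and the C-28 family) appear both at the start
    # and at the end of this ordering, presumably so that doublings
    # introduced by the substitutions themselves are also collapsed.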

    # normalize, upper-case, and filter non-French letters
    word = unicode_normalize('NFKD', text_type(word.upper()))
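    # 198 and 338 are the code points of 'Æ' and 'Œ', which NFKD leaves
    # intact; expanding them here keeps the filter below from discarding them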
    word = word.translate({198: 'AE', 338: 'OE'})
    word = ''.join(
        c
        for c in word
        if c
        in {
            'A',
            'B',
            'C',
            'D',
            'E',
            'F',
            'G',
            'H',
            'I',
            'J',
            'K',
            'L',
            'M',
            'N',
            'O',
            'P',
            'Q',
            'R',
            'S',
            'T',
            'U',
            'V',
            'W',
            'X',
            'Y',
            'Z',
            '-',
        }
    )

    for rule in rule_order:
        regex, repl = rule_table[rule]
        if isinstance(regex, text_type):
            word = word.replace(regex, repl)
        else:
            word = regex.sub(repl, word)

    return word


def henry_early(word, max_length=3):
    """Calculate the early version of the Henry code for a word.

    The early version of Henry coding is given in :cite:`Legare:1972`. This
    is different from the later version defined in :cite:`Henry:1976`.

    :param str word: the word to transform
    :param int max_length: the length of the code returned (defaults to 3)
    :returns: the early Henry code
    :rtype: str

    >>> henry_early('Marchand')
    'MRC'
    >>> henry_early('Beaulieu')
    'BL'
    >>> henry_early('Beaumont')
    'BM'
    >>> henry_early('Legrand')
    'LGR'
    >>> henry_early('Pelletier')
    'PLT'
    """
    _cons = {
        'B',
        'C',
        'D',
        'F',
        'G',
        'H',
        'J',
        'K',
        'L',
        'M',
        'N',
        'P',
        'Q',
        'R',
        'S',
        'T',
        'V',
        'W',
        'X',
        'Z',
    }
    _vows = {'A', 'E', 'I', 'O', 'U', 'Y'}
    _diph = {
        'AI': 'E',
        'AY': 'E',
        'EI': 'E',
        'AU': 'O',
        'OI': 'O',
        'OU': 'O',
        'EU': 'U',
    }
    # _unaltered = {'B', 'D', 'F', 'J', 'K', 'L', 'M', 'N', 'R', 'T', 'V'}
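    # _simple holds consonants that rule IIb recodes to simpler equivalents;
    # _diph above holds the initial diphthongs reduced by rule Ib3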
    _simple = {'W': 'V', 'X': 'S', 'Z': 'S'}

    word = unicode_normalize('NFKD', text_type(word.upper()))
    word = ''.join(
        c
        for c in word
        if c
        in {
            'A',
            'B',
            'C',
            'D',
            'E',
            'F',
            'G',
            'H',
            'I',
            'J',
            'K',
            'L',
            'M',
            'N',
            'O',
            'P',
            'Q',
            'R',
            'S',
            'T',
            'U',
            'V',
            'W',
            'X',
            'Y',
            'Z',
        }
    )

    if not word:
        return ''

    # Rule Ia seems to be covered entirely in II

    # Rule Ib
    if word[0] in _vows:
        # Ib1
        if (word[1:2] in _cons - {'M', 'N'} and word[2:3] in _cons) or (
            word[1:2] in _cons and word[2:3] not in _cons
        ):
            if word[0] == 'Y':
                word = 'I' + word[1:]
        # Ib2
        elif word[1:2] in {'M', 'N'} and word[2:3] in _cons:
            if word[0] == 'E':
                word = 'A' + word[1:]
            elif word[0] in {'I', 'U', 'Y'}:
                word = 'E' + word[1:]
        # Ib3
        elif word[:2] in _diph:
            word = _diph[word[:2]] + word[2:]
        # Ib4
        elif word[1:2] in _vows and word[0] == 'Y':
            word = 'I' + word[1:]

    code = ''
    skip = 0
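    # skip counts characters already consumed by a multi-character match
    # (doubled letters and the SAINTE/SAINT/STE/ST prefixes) so the loop
    # below does not code them twice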

    # Rule II
    for pos, char in enumerate(word):
        nxch = word[pos + 1 : pos + 2]
        prev = word[pos - 1 : pos]

        if skip:
            skip -= 1
        elif char in _vows:
            code += char
        # IIc
        elif char == nxch:
            skip = 1
            code += char
        elif word[pos : pos + 2] in {'CQ', 'DT', 'SC'}:
            continue
        # IIb
        elif char in _simple:
            code += _simple[char]
        elif char in {'C', 'G', 'P', 'Q', 'S'}:
            if char == 'C':
                if nxch in {'A', 'O', 'U', 'L', 'R'}:
                    code += 'K'
                elif nxch in {'E', 'I', 'Y'}:
                    code += 'S'
                elif nxch == 'H':
                    if word[pos + 2 : pos + 3] in _vows:
                        code += 'C'
                    else:  # CHR, CHL, etc.
                        code += 'K'
                else:
                    code += 'C'
            elif char == 'G':
                if nxch in {'A', 'O', 'U', 'L', 'R'}:
                    code += 'G'
                elif nxch in {'E', 'I', 'Y'}:
                    code += 'J'
                elif nxch == 'N':
                    code += 'N'
            elif char == 'P':
                if nxch != 'H':
                    code += 'P'
                else:
                    code += 'F'
            elif char == 'Q':
                if word[pos + 1 : pos + 3] in {'UE', 'UI', 'UY'}:
                    code += 'G'
                else:  # QUA, QUO, etc.
                    code += 'K'
            else:  # S...
                if word[pos : pos + 6] == 'SAINTE':
                    code += 'X'
                    skip = 5
                elif word[pos : pos + 5] == 'SAINT':
                    code += 'X'
                    skip = 4
                elif word[pos : pos + 3] == 'STE':
                    code += 'X'
                    skip = 2
                elif word[pos : pos + 2] == 'ST':
                    code += 'X'
                    skip = 1
                elif nxch in _cons:
                    continue
                else:
                    code += 'S'
        # IId
        elif char == 'H' and prev in _cons:
            continue
        elif char in _cons - {'L', 'R'} and nxch in _cons - {'L', 'R'}:
            continue
        elif char == 'L' and nxch in {'M', 'N'}:
            continue
        elif char in {'M', 'N'} and prev in _vows and nxch in _cons:
            continue
        # IIa
        else:
            code += char

    # IIe1
    if code[-4:] in {'AULT', 'EULT', 'OULT'}:
        code = code[:-2]
    # The following are blocked by rules above
    # elif code[-4:-3] in _vows and code[-3:] == 'MPS':
    #     code = code[:-3]
    # elif code[-3:-2] in _vows and code[-2:] in {'MB', 'MP', 'ND',
    #                                             'NS', 'NT'}:
    #     code = code[:-2]
    elif code[-2:-1] == 'R' and code[-1:] in _cons:
        code = code[:-1]
    # IIe2
    elif code[-2:-1] in _vows and code[-1:] in {'D', 'M', 'N', 'S', 'T'}:
        code = code[:-1]
    elif code[-2:] == 'ER':
        code = code[:-1]

    # Drop non-initial vowels
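    # (65, 69, 73, 79, 85, 89 are the code points of A, E, I, O, U, Y)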
    code = code[:1] + code[1:].translate(
        {65: '', 69: '', 73: '', 79: '', 85: '', 89: ''}
    )

    if max_length != -1:
        code = code[:max_length]

    return code


if __name__ == '__main__':
    import doctest

    doctest.testmod()