abydos.phonetic._fr.HenryEarly.encode() - Code Metrics - Inspection of "0.3.6" - chrislit/abydos - Measure and Improve Code Quality continuously with Scrutinizer

Completed

Pull Request — master (#138)

by Chris

created 2018-11-05 04:07 UTC

abydos.phonetic._fr.HenryEarly.encode() F

↳ Parent: abydos.phonetic._fr

Complexity

Conditions

Size

Total Lines	171
Code Lines	111

Duplication

Lines	0
Ratio	0 %

Code Coverage

Tests	96
CRAP Score	56

Importance

Changes

Metric	Value
eloc	111
dl	0
loc	171
ccs	96
cts	96
cp	1
rs	0
c	0
b	0
f	0
cc	56
nop	3
crap	56

How to fix Long Method Complexity

# -*- coding: utf-8 -*-

# Copyright 2018 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.

"""abydos.phonetic._fr.

The phonetic._fr module implements phonetic algorithms intended for French,
including:

    - FONEM
    - an early version of Henry Code
"""

from __future__ import unicode_literals

from re import compile as re_compile
from unicodedata import normalize as unicode_normalize

from six import text_type

from ._phonetic import Phonetic

__all__ = ['FONEM', 'HenryEarly', 'fonem', 'henry_early']


class FONEM(Phonetic):
    """FONEM phonetic encoder.

    FONEM is a phonetic algorithm designed for French (particularly surnames in
    Saguenay, Canada), defined in :cite:`Bouchard:1981`.

    Guillaume Plique's Javascript implementation :cite:`Plique:2018` at
    https://github.com/Yomguithereal/talisman/blob/master/src/phonetics/french/fonem.js
    was also consulted for this implementation.

    The algorithm is expressed as a table of named rewrite rules
    (``_rule_table``) that are applied one after another in the sequence
    given by ``_rule_order``.
    """

    # Rule name -> (pattern, replacement).
    #
    # The pattern is either a compiled regular expression or a plain string;
    # plain strings are applied with str.replace, regular expressions with
    # their sub() method. The replacement is a string or, for C-29, a
    # callable taking the match object.
    #
    # I don't see a sane way of doing this without regexps :(
    _rule_table = {
        # Vowels & groups of vowels
        'V-1': (re_compile('E?AU'), 'O'),
        'V-2,5': (re_compile('(E?AU|O)L[TX]$'), 'O'),
        'V-3,4': (re_compile('E?AU[TX]$'), 'O'),
        'V-6': (re_compile('E?AUL?D$'), 'O'),
        'V-7': (re_compile(r'(?<!G)AY$'), 'E'),
        'V-8': (re_compile('EUX$'), 'EU'),
        'V-9': (re_compile('EY(?=$|[BCDFGHJKLMNPQRSTVWXZ])'), 'E'),
        'V-10': ('Y', 'I'),
        'V-11': (re_compile('(?<=[AEIOUY])I(?=[AEIOUY])'), 'Y'),
        'V-12': (re_compile('(?<=[AEIOUY])ILL'), 'Y'),
        'V-13': (re_compile('OU(?=[AEOU]|I(?!LL))'), 'W'),
        'V-14': (re_compile(r'([AEIOUY])(?=\1)'), ''),
        # Nasal vowels
        'V-15': (re_compile('[AE]M(?=[BCDFGHJKLMPQRSTVWXZ])(?!$)'), 'EN'),
        'V-16': (re_compile('OM(?=[BCDFGHJKLMPQRSTVWXZ])'), 'ON'),
        'V-17': (re_compile('AN(?=[BCDFGHJKLMNPQRSTVWXZ])'), 'EN'),
        'V-18': (re_compile('(AI[MN]|EIN)(?=[BCDFGHJKLMNPQRSTVWXZ]|$)'), 'IN'),
        'V-19': (re_compile('B(O|U|OU)RNE?$'), 'BURN'),
        'V-20': (
            re_compile(
                '(^IM|(?<=[BCDFGHJKLMNPQRSTVWXZ])'
                + 'IM(?=[BCDFGHJKLMPQRSTVWXZ]))'
            ),
            'IN',
        ),
        # Consonants and groups of consonants
        'C-1': ('BV', 'V'),
        'C-2': (re_compile('(?<=[AEIOUY])C(?=[EIY])'), 'SS'),
        'C-3': (re_compile('(?<=[BDFGHJKLMNPQRSTVWZ])C(?=[EIY])'), 'S'),
        'C-4': (re_compile('^C(?=[EIY])'), 'S'),
        'C-5': (re_compile('^C(?=[OUA])'), 'K'),
        'C-6': (re_compile('(?<=[AEIOUY])C$'), 'K'),
        'C-7': (re_compile('C(?=[BDFGJKLMNPQRSTVWXZ])'), 'K'),
        'C-8': (re_compile('CC(?=[AOU])'), 'K'),
        'C-9': (re_compile('CC(?=[EIY])'), 'X'),
        'C-10': (re_compile('G(?=[EIY])'), 'J'),
        'C-11': (re_compile('GA(?=I?[MN])'), 'G#'),
        'C-12': (re_compile('GE(O|AU)'), 'JO'),
        'C-13': (re_compile('GNI(?=[AEIOUY])'), 'GN'),
        'C-14': (re_compile('(?<![PCS])H'), ''),
        'C-15': ('JEA', 'JA'),
        'C-16': (re_compile('^MAC(?=[BCDFGHJKLMNPQRSTVWXZ])'), 'MA#'),
        'C-17': (re_compile('^MC'), 'MA#'),
        'C-18': ('PH', 'F'),
        'C-19': ('QU', 'K'),
        'C-20': (re_compile('^SC(?=[EIY])'), 'S'),
        'C-21': (re_compile('(?<=.)SC(?=[EIY])'), 'SS'),
        'C-22': (re_compile('(?<=.)SC(?=[AOU])'), 'SK'),
        'C-23': ('SH', 'CH'),
        'C-24': (re_compile('TIA$'), 'SSIA'),
        'C-25': (re_compile('(?<=[AIOUY])W'), ''),
        'C-26': (re_compile('X[CSZ]'), 'X'),
        'C-27': (
            re_compile(
                '(?<=[AEIOUY])Z|(?<=[BCDFGHJKLMNPQRSTVWXZ])'
                + 'Z(?=[BCDFGHJKLMNPQRSTVWXZ])'
            ),
            'S',
        ),
        'C-28': (re_compile(r'([BDFGHJKMNPQRTVWXZ])\1'), r'\1'),
        'C-28a': (re_compile('CC(?=[BCDFGHJKLMNPQRSTVWXZ]|$)'), 'C'),
        'C-28b': (re_compile('((?<=[BCDFGHJKLMNPQRSTVWXZ])|^)SS'), 'S'),
        'C-28bb': (re_compile('SS(?=[BCDFGHJKLMNPQRSTVWXZ]|$)'), 'S'),
        'C-28c': (re_compile('((?<=[^I])|^)LL'), 'L'),
        'C-28d': (re_compile('ILE$'), 'ILLE'),
        'C-29': (
            re_compile(
                '(ILS|[CS]H|[MN]P|R[CFKLNSX])$|([BCDFGHJKL'
                + 'MNPQRSTVWXZ])[BCDFGHJKLMNPQRSTVWXZ]$'
            ),
            # Keep whichever of the two alternatives matched.
            lambda m: (m.group(1) or '') + (m.group(2) or ''),
        ),
        'C-30,32': (re_compile('^(SA?INT?|SEI[NM]|CINQ?|ST)(?!E)-?'), 'ST-'),
        'C-31,33': (re_compile('^(SAINTE|STE)-?'), 'STE-'),
        # Rules to undo rule bleeding prevention in C-11, C-16, C-17
        # (those rules emit a '#' placeholder that is expanded again here)
        'C-34': ('G#', 'GA'),
        'C-35': ('MA#', 'MAC'),
    }

    # Sequence in which the rules above are applied. The doubled-letter
    # rules (V-14 and the C-28 family) appear both at the start and again
    # near the end of the sequence.
    _rule_order = (
        'V-14',
        'C-28',
        'C-28a',
        'C-28b',
        'C-28bb',
        'C-28c',
        'C-28d',
        'C-12',
        'C-8',
        'C-9',
        'C-10',
        'C-16',
        'C-17',
        'C-2',
        'C-3',
        'C-7',
        'V-2,5',
        'V-3,4',
        'V-6',
        'V-1',
        'C-14',
        'C-31,33',
        'C-30,32',
        'C-11',
        'V-15',
        'V-17',
        'V-18',
        'V-7',
        'V-8',
        'V-9',
        'V-10',
        'V-11',
        'V-12',
        'V-13',
        'V-16',
        'V-19',
        'V-20',
        'C-1',
        'C-4',
        'C-5',
        'C-6',
        'C-13',
        'C-15',
        'C-18',
        'C-19',
        'C-20',
        'C-21',
        'C-22',
        'C-23',
        'C-24',
        'C-25',
        'C-26',
        'C-27',
        'C-29',
        'V-14',
        'C-28',
        'C-28a',
        'C-28b',
        'C-28bb',
        'C-28c',
        'C-28d',
        'C-34',
        'C-35',
    )

    # Characters kept from the input: upper-case A-Z plus the hyphen.
    _uc_set = set('ABCDEFGHIJKLMNOPQRSTUVWXYZ-')

    def encode(self, word):
        """Encode a word with the FONEM rules.

        The word is upper-cased and NFKD-normalized, the ligatures Æ and Œ
        are expanded to ``AE`` and ``OE``, and every character outside
        ``_uc_set`` is discarded. Each rule named in ``_rule_order`` is then
        applied in turn to the result.

        :param str word: the word to transform
        :returns: the FONEM code
        :rtype: str

        >>> pe = FONEM()
        >>> pe.encode('Marchand')
        'MARCHEN'
        >>> pe.encode('Beaulieu')
        'BOLIEU'
        >>> pe.encode('Beaumont')
        'BOMON'
        >>> pe.encode('Legrand')
        'LEGREN'
        >>> pe.encode('Pelletier')
        'PELETIER'
        """
        keep = self._uc_set
        table = self._rule_table

        text = text_type(word.upper())
        # 198 is the code point of Æ, 338 that of Œ.
        text = unicode_normalize('NFKD', text).translate(
            {198: 'AE', 338: 'OE'}
        )
        text = ''.join(ch for ch in text if ch in keep)

        for name in self._rule_order:
            pattern, replacement = table[name]
            if not isinstance(pattern, text_type):
                # Compiled regular expression.
                text = pattern.sub(replacement, text)
                continue
            # Plain substring rule.
            text = text.replace(pattern, replacement)

        return text


def fonem(word):
    """Encode a word with FONEM.

    Convenience function that builds a :py:class:`FONEM` instance and
    delegates to :py:meth:`FONEM.encode`.

    :param str word: the word to transform
    :returns: the FONEM code
    :rtype: str

    >>> fonem('Marchand')
    'MARCHEN'
    >>> fonem('Beaulieu')
    'BOLIEU'
    >>> fonem('Beaumont')
    'BOMON'
    >>> fonem('Legrand')
    'LEGREN'
    >>> fonem('Pelletier')
    'PELETIER'
    """
    encoder = FONEM()
    return encoder.encode(word)


class HenryEarly(Phonetic):
    """Henry code, early version.

    The early version of Henry coding is given in :cite:`Legare:1972`. This is
    different from the later version defined in :cite:`Henry:1976`.

    The encoder reads ``_uc_set`` and ``_uc_vy_set``, which are not defined
    in this class and are expected to be provided by :py:class:`Phonetic`.
    """

    # Upper-case consonants.
    _uc_c_set = set('BCDFGHJKLMNPQRSTVWXZ')
    # Consonant subsets needed by Rules Ib1 and IId. They are built once
    # here rather than on every call / loop iteration.
    _uc_c_not_mn_set = _uc_c_set - {'M', 'N'}
    _uc_c_not_lr_set = _uc_c_set - {'L', 'R'}
    # Word-initial vowel pairs and the single vowel replacing them (Ib3).
    _diph = {
        'AI': 'E',
        'AY': 'E',
        'EI': 'E',
        'AU': 'O',
        'OI': 'O',
        'OU': 'O',
        'EU': 'U',
    }
    # Consonants with an unconditional one-letter replacement (IIb).
    _simple = {'W': 'V', 'X': 'S', 'Z': 'S'}
    # Prefixes starting with S that are coded as 'X', paired with the number
    # of following characters to skip. Longer prefixes must come first.
    _saint_prefixes = (('SAINTE', 5), ('SAINT', 4), ('STE', 2), ('ST', 1))

    def encode(self, word, max_length=3):
        """Calculate the early version of the Henry code for a word.

        :param str word: the word to transform
        :param int max_length: the length of the code returned (defaults to
            3); -1 disables truncation
        :returns: the early Henry code, or the empty string if nothing is
            left of the word after filtering
        :rtype: str

        >>> pe = HenryEarly()
        >>> pe.encode('Marchand')
        'MRC'
        >>> pe.encode('Beaulieu')
        'BL'
        >>> pe.encode('Beaumont')
        'BM'
        >>> pe.encode('Legrand')
        'LGR'
        >>> pe.encode('Pelletier')
        'PLT'
        """
        word = unicode_normalize('NFKD', text_type(word.upper()))
        word = ''.join(c for c in word if c in self._uc_set)

        if not word:
            return ''

        # Rule Ia seems to be covered entirely in II

        # Rule Ib
        word = self._recode_initial_vowel(word)

        # Rule II
        pieces = []
        skip = 0
        for pos, char in enumerate(word):
            if skip:
                # This character was consumed by an earlier one.
                skip -= 1
                continue
            piece, skip = self._encode_char(word, pos, char)
            pieces.append(piece)

        # Rule IIe
        code = self._trim_ending(''.join(pieces))

        # Drop non-initial vowels (the keys are the code points of AEIOUY)
        code = code[:1] + code[1:].translate(
            {65: '', 69: '', 73: '', 79: '', 85: '', 89: ''}
        )

        if max_length != -1:
            code = code[:max_length]

        return code

    def _recode_initial_vowel(self, word):
        """Apply Rule Ib, rewriting the start of a vowel-initial word."""
        first = word[0]
        if first not in self._uc_vy_set:
            return word

        second = word[1:2]
        third = word[2:3]

        # Ib1
        if (second in self._uc_c_not_mn_set and third in self._uc_c_set) or (
            second in self._uc_c_set and third not in self._uc_c_set
        ):
            if first == 'Y':
                word = 'I' + word[1:]
        # Ib2
        elif second in {'M', 'N'} and third in self._uc_c_set:
            if first == 'E':
                word = 'A' + word[1:]
            elif first in {'I', 'U', 'Y'}:
                word = 'E' + word[1:]
        # Ib3
        elif word[:2] in self._diph:
            word = self._diph[word[:2]] + word[2:]
        # Ib4
        elif second in self._uc_vy_set and first == 'Y':
            word = 'I' + word[1:]

        return word

    def _encode_char(self, word, pos, char):
        """Apply Rule II at ``pos``; return (emitted text, chars to skip)."""
        nxch = word[pos + 1 : pos + 2]
        # At pos == 0 this slice is empty, i.e. there is no previous char.
        prev = word[pos - 1 : pos]

        if char in self._uc_vy_set:
            return char, 0
        # IIc: a doubled consonant is emitted once
        if char == nxch:
            return char, 1
        if word[pos : pos + 2] in {'CQ', 'DT', 'SC'}:
            return '', 0
        # IIb
        if char in self._simple:
            return self._simple[char], 0
        if char == 'C':
            return self._encode_c(word, pos, nxch), 0
        if char == 'G':
            return self._encode_g(nxch), 0
        if char == 'P':
            return ('F' if nxch == 'H' else 'P'), 0
        if char == 'Q':
            return self._encode_q(word, pos), 0
        if char == 'S':
            return self._encode_s(word, pos, nxch)
        # IId
        if char == 'H' and prev in self._uc_c_set:
            return '', 0
        if char in self._uc_c_not_lr_set and nxch in self._uc_c_not_lr_set:
            return '', 0
        if char == 'L' and nxch in {'M', 'N'}:
            return '', 0
        if (
            char in {'M', 'N'}
            and prev in self._uc_vy_set
            and nxch in self._uc_c_set
        ):
            return '', 0
        # IIa
        return char, 0

    def _encode_c(self, word, pos, nxch):
        """Return the code for a 'C' at ``pos`` given the next character."""
        if nxch in {'A', 'O', 'U', 'L', 'R'}:
            return 'K'
        if nxch in {'E', 'I', 'Y'}:
            return 'S'
        if nxch == 'H' and word[pos + 2 : pos + 3] not in self._uc_vy_set:
            # CHR, CHL, etc.
            return 'K'
        return 'C'

    @staticmethod
    def _encode_g(nxch):
        """Return the code for a 'G' given the next character."""
        if nxch in {'A', 'O', 'U', 'L', 'R'}:
            return 'G'
        if nxch in {'E', 'I', 'Y'}:
            return 'J'
        if nxch == 'N':
            return 'N'
        # In every other context the G contributes nothing to the code.
        return ''

    @staticmethod
    def _encode_q(word, pos):
        """Return the code for a 'Q' at ``pos``."""
        if word[pos + 1 : pos + 3] in {'UE', 'UI', 'UY'}:
            return 'G'
        # QUA, QUO, etc.
        return 'K'

    def _encode_s(self, word, pos, nxch):
        """Return (emitted text, chars to skip) for an 'S' at ``pos``."""
        for prefix, skip in self._saint_prefixes:
            if word.startswith(prefix, pos):
                return 'X', skip
        if nxch in self._uc_c_set:
            return '', 0
        return 'S', 0

    def _trim_ending(self, code):
        """Apply Rule IIe, shortening certain endings of the raw code."""
        last = code[-1:]
        penult = code[-2:-1]

        # IIe1
        if code[-4:] in {'AULT', 'EULT', 'OULT'}:
            return code[:-2]
        # The remaining IIe1 cases (a vowel followed by MPS, MB, MP, ND, NS
        # or NT) are not handled here: they are blocked by the rules above.
        if penult == 'R' and last in self._uc_c_set:
            return code[:-1]
        # IIe2
        if penult in self._uc_vy_set and last in {'D', 'M', 'N', 'S', 'T'}:
            return code[:-1]
        if code[-2:] == 'ER':
            return code[:-1]
        return code


def henry_early(word, max_length=3):
    """Encode a word with the early version of the Henry code.

    Convenience function that builds a :py:class:`HenryEarly` instance and
    delegates to :py:meth:`HenryEarly.encode`.

    :param str word: the word to transform
    :param int max_length: the length of the code returned (defaults to 3)
    :returns: the early Henry code
    :rtype: str

    >>> henry_early('Marchand')
    'MRC'
    >>> henry_early('Beaulieu')
    'BL'
    >>> henry_early('Beaumont')
    'BM'
    >>> henry_early('Legrand')
    'LGR'
    >>> henry_early('Pelletier')
    'PLT'
    """
    encoder = HenryEarly()
    return encoder.encode(word, max_length)


if __name__ == '__main__':
    # When executed as a script, run the examples embedded in the
    # docstrings of this module.
    from doctest import testmod

    testmod()


1		# -*- coding: utf-8 -*-
2
3		# Copyright 2018 by Christopher C. Little.
4		# This file is part of Abydos.
5		#
6		# Abydos is free software: you can redistribute it and/or modify
7		# it under the terms of the GNU General Public License as published by
8		# the Free Software Foundation, either version 3 of the License, or
9		# (at your option) any later version.
10		#
11		# Abydos is distributed in the hope that it will be useful,
12		# but WITHOUT ANY WARRANTY; without even the implied warranty of
13		# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14		# GNU General Public License for more details.
15		#
16		# You should have received a copy of the GNU General Public License
17		# along with Abydos. If not, see <http://www.gnu.org/licenses/>.
18
19	1	"""abydos.phonetic._fr.
20
21		The phonetic._fr module implements phonetic algorithms intended for French,
22		including:
23
24		- FONEM
25		- an early version of Henry Code
26		"""
27
28	1	from __future__ import unicode_literals
29
30	1	from re import compile as re_compile
31	1	from unicodedata import normalize as unicode_normalize
32
33	1	from six import text_type
34
35	1	from ._phonetic import Phonetic
36
37	1	__all__ = ['FONEM', 'HenryEarly', 'fonem', 'henry_early']
38
39
40	1	class FONEM(Phonetic):
		0 ignored issues – show Unused Code introduced 2018-11-04 08:02 UTC by Report Bug Copy Issue Report The variable `__class__` seems to be unused. Loading history...
41		"""FONEM.
42
43		FONEM is a phonetic algorithm designed for French (particularly surnames in
44		Saguenay, Canada), defined in :cite:`Bouchard:1981`.
45
46		Guillaume Plique's Javascript implementation :cite:`Plique:2018` at
47		https://github.com/Yomguithereal/talisman/blob/master/src/phonetics/french/fonem.js
48		was also consulted for this implementation.
49		"""
50
51		# I don't see a sane way of doing this without regexps :(
52	1	_rule_table = {
53		# Vowels & groups of vowels
54		'V-1': (re_compile('E?AU'), 'O'),
55		'V-2,5': (re_compile('(E?AU\|O)L[TX]$'), 'O'),
56		'V-3,4': (re_compile('E?AU[TX]$'), 'O'),
57		'V-6': (re_compile('E?AUL?D$'), 'O'),
58		'V-7': (re_compile(r'(?<!G)AY$'), 'E'),
59		'V-8': (re_compile('EUX$'), 'EU'),
60		'V-9': (re_compile('EY(?=$\|[BCDFGHJKLMNPQRSTVWXZ])'), 'E'),
61		'V-10': ('Y', 'I'),
62		'V-11': (re_compile('(?<=[AEIOUY])I(?=[AEIOUY])'), 'Y'),
63		'V-12': (re_compile('(?<=[AEIOUY])ILL'), 'Y'),
64		'V-13': (re_compile('OU(?=[AEOU]\|I(?!LL))'), 'W'),
65		'V-14': (re_compile(r'([AEIOUY])(?=\1)'), ''),
66		# Nasal vowels
67		'V-15': (re_compile('[AE]M(?=[BCDFGHJKLMPQRSTVWXZ])(?!$)'), 'EN'),
68		'V-16': (re_compile('OM(?=[BCDFGHJKLMPQRSTVWXZ])'), 'ON'),
69		'V-17': (re_compile('AN(?=[BCDFGHJKLMNPQRSTVWXZ])'), 'EN'),
70		'V-18': (re_compile('(AI[MN]\|EIN)(?=[BCDFGHJKLMNPQRSTVWXZ]\|$)'), 'IN'),
71		'V-19': (re_compile('B(O\|U\|OU)RNE?$'), 'BURN'),
72		'V-20': (
73		re_compile(
74		'(^IM\|(?<=[BCDFGHJKLMNPQRSTVWXZ])'
75		+ 'IM(?=[BCDFGHJKLMPQRSTVWXZ]))'
76		),
77		'IN',
78		),
79		# Consonants and groups of consonants
80		'C-1': ('BV', 'V'),
81		'C-2': (re_compile('(?<=[AEIOUY])C(?=[EIY])'), 'SS'),
82		'C-3': (re_compile('(?<=[BDFGHJKLMNPQRSTVWZ])C(?=[EIY])'), 'S'),
83		'C-4': (re_compile('^C(?=[EIY])'), 'S'),
84		'C-5': (re_compile('^C(?=[OUA])'), 'K'),
85		'C-6': (re_compile('(?<=[AEIOUY])C$'), 'K'),
86		'C-7': (re_compile('C(?=[BDFGJKLMNPQRSTVWXZ])'), 'K'),
87		'C-8': (re_compile('CC(?=[AOU])'), 'K'),
88		'C-9': (re_compile('CC(?=[EIY])'), 'X'),
89		'C-10': (re_compile('G(?=[EIY])'), 'J'),
90		'C-11': (re_compile('GA(?=I?[MN])'), 'G#'),
91		'C-12': (re_compile('GE(O\|AU)'), 'JO'),
92		'C-13': (re_compile('GNI(?=[AEIOUY])'), 'GN'),
93		'C-14': (re_compile('(?<![PCS])H'), ''),
94		'C-15': ('JEA', 'JA'),
95		'C-16': (re_compile('^MAC(?=[BCDFGHJKLMNPQRSTVWXZ])'), 'MA#'),
96		'C-17': (re_compile('^MC'), 'MA#'),
97		'C-18': ('PH', 'F'),
98		'C-19': ('QU', 'K'),
99		'C-20': (re_compile('^SC(?=[EIY])'), 'S'),
100		'C-21': (re_compile('(?<=.)SC(?=[EIY])'), 'SS'),
101		'C-22': (re_compile('(?<=.)SC(?=[AOU])'), 'SK'),
102		'C-23': ('SH', 'CH'),
103		'C-24': (re_compile('TIA$'), 'SSIA'),
104		'C-25': (re_compile('(?<=[AIOUY])W'), ''),
105		'C-26': (re_compile('X[CSZ]'), 'X'),
106		'C-27': (
107		re_compile(
108		'(?<=[AEIOUY])Z\|(?<=[BCDFGHJKLMNPQRSTVWXZ])'
109		+ 'Z(?=[BCDFGHJKLMNPQRSTVWXZ])'
110		),
111		'S',
112		),
113		'C-28': (re_compile(r'([BDFGHJKMNPQRTVWXZ])\1'), r'\1'),
114		'C-28a': (re_compile('CC(?=[BCDFGHJKLMNPQRSTVWXZ]\|$)'), 'C'),
115		'C-28b': (re_compile('((?<=[BCDFGHJKLMNPQRSTVWXZ])\|^)SS'), 'S'),
116		'C-28bb': (re_compile('SS(?=[BCDFGHJKLMNPQRSTVWXZ]\|$)'), 'S'),
117		'C-28c': (re_compile('((?<=[^I])\|^)LL'), 'L'),
118		'C-28d': (re_compile('ILE$'), 'ILLE'),
119		'C-29': (
120		re_compile(
121		'(ILS\|[CS]H\|[MN]P\|R[CFKLNSX])$\|([BCDFGHJKL'
122		+ 'MNPQRSTVWXZ])[BCDFGHJKLMNPQRSTVWXZ]$'
123		),
124		lambda m: (m.group(1) or '') + (m.group(2) or ''),
125		),
126		'C-30,32': (re_compile('^(SA?INT?\|SEI[NM]\|CINQ?\|ST)(?!E)-?'), 'ST-'),
127		'C-31,33': (re_compile('^(SAINTE\|STE)-?'), 'STE-'),
128		# Rules to undo rule bleeding prevention in C-11, C-16, C-17
129		'C-34': ('G#', 'GA'),
130		'C-35': ('MA#', 'MAC'),
131		}
132	1	_rule_order = (
133		'V-14',
134		'C-28',
135		'C-28a',
136		'C-28b',
137		'C-28bb',
138		'C-28c',
139		'C-28d',
140		'C-12',
141		'C-8',
142		'C-9',
143		'C-10',
144		'C-16',
145		'C-17',
146		'C-2',
147		'C-3',
148		'C-7',
149		'V-2,5',
150		'V-3,4',
151		'V-6',
152		'V-1',
153		'C-14',
154		'C-31,33',
155		'C-30,32',
156		'C-11',
157		'V-15',
158		'V-17',
159		'V-18',
160		'V-7',
161		'V-8',
162		'V-9',
163		'V-10',
164		'V-11',
165		'V-12',
166		'V-13',
167		'V-16',
168		'V-19',
169		'V-20',
170		'C-1',
171		'C-4',
172		'C-5',
173		'C-6',
174		'C-13',
175		'C-15',
176		'C-18',
177		'C-19',
178		'C-20',
179		'C-21',
180		'C-22',
181		'C-23',
182		'C-24',
183		'C-25',
184		'C-26',
185		'C-27',
186		'C-29',
187		'V-14',
188		'C-28',
189		'C-28a',
190		'C-28b',
191		'C-28bb',
192		'C-28c',
193		'C-28d',
194		'C-34',
195		'C-35',
196		)
197
198	1	_uc_set = set('ABCDEFGHIJKLMNOPQRSTUVWXYZ-')
199
200	1	def encode(self, word):
201		"""Return the FONEM code of a word.
202
203		:param str word: the word to transform
204		:returns: the FONEM code
205		:rtype: str
206
207		>>> pe = FONEM()
208		>>> pe.encode('Marchand')
209		'MARCHEN'
210		>>> pe.encode('Beaulieu')
211		'BOLIEU'
212		>>> pe.encode('Beaumont')
213		'BOMON'
214		>>> pe.encode('Legrand')
215		'LEGREN'
216		>>> pe.encode('Pelletier')
217		'PELETIER'
218		"""
219		# normalize, upper-case, and filter non-French letters
220	1	word = unicode_normalize('NFKD', text_type(word.upper()))
221	1	word = word.translate({198: 'AE', 338: 'OE'})
222	1	word = ''.join(c for c in word if c in self._uc_set)
223
224	1	for rule in self._rule_order:
225	1	regex, repl = self._rule_table[rule]
226	1	if isinstance(regex, text_type):
227	1	word = word.replace(regex, repl)
228		else:
229	1	word = regex.sub(repl, word)
230
231	1	return word
232
233
234	1	def fonem(word):
235		"""Return the FONEM code of a word.
236
237		This is a wrapper for :py:meth:`FONEM.encode`.
238
239		:param str word: the word to transform
240		:returns: the FONEM code
241		:rtype: str
242
243		>>> fonem('Marchand')
244		'MARCHEN'
245		>>> fonem('Beaulieu')
246		'BOLIEU'
247		>>> fonem('Beaumont')
248		'BOMON'
249		>>> fonem('Legrand')
250		'LEGREN'
251		>>> fonem('Pelletier')
252		'PELETIER'
253		"""
254	1	return FONEM().encode(word)
255
256
257	1	class HenryEarly(Phonetic):
		0 ignored issues – show Unused Code introduced 2018-11-04 08:02 UTC by Report Bug Copy Issue Report The variable `__class__` seems to be unused. Loading history...
258		"""Henry code, early version.
259
260		The early version of Henry coding is given in :cite:`Legare:1972`. This is
261		different from the later version defined in :cite:`Henry:1976`.
262		"""
263
264	1	_uc_c_set = set('BCDFGHJKLMNPQRSTVWXZ')
265	1	_diph = {
266		'AI': 'E',
267		'AY': 'E',
268		'EI': 'E',
269		'AU': 'O',
270		'OI': 'O',
271		'OU': 'O',
272		'EU': 'U',
273		}
274	1	_simple = {'W': 'V', 'X': 'S', 'Z': 'S'}
275
276	1	def encode(self, word, max_length=3):
		0 ignored issues – show Bug introduced 2018-11-04 08:02 UTC by Report Bug Copy Issue Report Parameters differ from overridden 'encode' method Loading history...
277		"""Calculate the early version of the Henry code for a word.
278
279		:param str word: the word to transform
280		:param int max_length: the length of the code returned (defaults to 3)
281		:returns: the early Henry code
282		:rtype: str
283
284		>>> henry_early('Marchand')
285		'MRC'
286		>>> henry_early('Beaulieu')
287		'BL'
288		>>> henry_early('Beaumont')
289		'BM'
290		>>> henry_early('Legrand')
291		'LGR'
292		>>> henry_early('Pelletier')
293		'PLT'
294		"""
295	1	word = unicode_normalize('NFKD', text_type(word.upper()))
296	1	word = ''.join(c for c in word if c in self._uc_set)
297
298	1	if not word:
299	1	return ''
300
301		# Rule Ia seems to be covered entirely in II
302
303		# Rule Ib
304	1	if word[0] in self._uc_vy_set:
305		# Ib1
306	1	if (
307		word[1:2] in self._uc_c_set - {'M', 'N'}
		0 ignored issues – show Coding Style introduced 2018-11-04 08:02 UTC by Report Bug Copy Issue Report Wrong hanging indentation before block (add 4 spaces). Loading history...
308		and word[2:3] in self._uc_c_set
		0 ignored issues – show Coding Style introduced 2018-11-04 08:02 UTC by Report Bug Copy Issue Report Wrong hanging indentation before block (add 4 spaces). Loading history...
309		) or (
310		word[1:2] in self._uc_c_set and word[2:3] not in self._uc_c_set
311		):
312	1	if word[0] == 'Y':
313	1	word = 'I' + word[1:]
314		# Ib2
315	1	elif word[1:2] in {'M', 'N'} and word[2:3] in self._uc_c_set:
316	1	if word[0] == 'E':
317	1	word = 'A' + word[1:]
318	1	elif word[0] in {'I', 'U', 'Y'}:
319	1	word = 'E' + word[1:]
320		# Ib3
321	1	elif word[:2] in self._diph:
322	1	word = self._diph[word[:2]] + word[2:]
323		# Ib4
324	1	elif word[1:2] in self._uc_vy_set and word[0] == 'Y':
325	1	word = 'I' + word[1:]
326
327	1	code = ''
328	1	skip = 0
329
330		# Rule II
331	1	for pos, char in enumerate(word):
332	1	nxch = word[pos + 1 : pos + 2]
333	1	prev = word[pos - 1 : pos]
334
335	1	if skip:
336	1	skip -= 1
337	1	elif char in self._uc_vy_set:
338	1	code += char
339		# IIc
340	1	elif char == nxch:
341	1	skip = 1
342	1	code += char
343	1	elif word[pos : pos + 2] in {'CQ', 'DT', 'SC'}:
344	1	continue
345		# IIb
346	1	elif char in self._simple:
347	1	code += self._simple[char]
348	1	elif char in {'C', 'G', 'P', 'Q', 'S'}:
349	1	if char == 'C':
350	1	if nxch in {'A', 'O', 'U', 'L', 'R'}:
351	1	code += 'K'
352	1	elif nxch in {'E', 'I', 'Y'}:
353	1	code += 'S'
354	1	elif nxch == 'H':
355	1	if word[pos + 2 : pos + 3] in self._uc_vy_set:
356	1	code += 'C'
357		else: # CHR, CHL, etc.
358	1	code += 'K'
359		else:
360	1	code += 'C'
361	1	elif char == 'G':
362	1	if nxch in {'A', 'O', 'U', 'L', 'R'}:
363	1	code += 'G'
364	1	elif nxch in {'E', 'I', 'Y'}:
365	1	code += 'J'
366	1	elif nxch == 'N':
367	1	code += 'N'
368	1	elif char == 'P':
369	1	if nxch != 'H':
370	1	code += 'P'
371		else:
372	1	code += 'F'
373	1	elif char == 'Q':
374	1	if word[pos + 1 : pos + 3] in {'UE', 'UI', 'UY'}:
375	1	code += 'G'
376		else: # QUA, QUO, etc.
377	1	code += 'K'
378		else: # S...
379	1	if word[pos : pos + 6] == 'SAINTE':
380	1	code += 'X'
381	1	skip = 5
382	1	elif word[pos : pos + 5] == 'SAINT':
383	1	code += 'X'
384	1	skip = 4
385	1	elif word[pos : pos + 3] == 'STE':
386	1	code += 'X'
387	1	skip = 2
388	1	elif word[pos : pos + 2] == 'ST':
389	1	code += 'X'
390	1	skip = 1
391	1	elif nxch in self._uc_c_set:
392	1	continue
393		else:
394	1	code += 'S'
395		# IId
396	1	elif char == 'H' and prev in self._uc_c_set:
397	1	continue
398	1	elif char in self._uc_c_set - {
399		'L',
		0 ignored issues – show Coding Style introduced 2018-11-04 08:02 UTC by Report Bug Copy Issue Report Wrong hanging indentation before block (add 4 spaces). Loading history...
400		'R',
		0 ignored issues – show Coding Style introduced 2018-11-04 08:02 UTC by Report Bug Copy Issue Report Wrong hanging indentation before block (add 4 spaces). Loading history...
401		} and nxch in self._uc_c_set - {'L', 'R'}:
402	1	continue
403	1	elif char == 'L' and nxch in {'M', 'N'}:
404	1	continue
405	1	elif (
406		char in {'M', 'N'}
		0 ignored issues – show Coding Style introduced 2018-11-04 08:02 UTC by Report Bug Copy Issue Report Wrong hanging indentation before block (add 4 spaces). Loading history...
407		and prev in self._uc_vy_set
		0 ignored issues – show Coding Style introduced 2018-11-04 08:02 UTC by Report Bug Copy Issue Report Wrong hanging indentation before block (add 4 spaces). Loading history...
408		and nxch in self._uc_c_set
		0 ignored issues – show Coding Style introduced 2018-11-04 08:02 UTC by Report Bug Copy Issue Report Wrong hanging indentation before block (add 4 spaces). Loading history...
409		):
410	1	continue
411		# IIa
412		else:
413	1	code += char
414
415		# IIe1
416	1	if code[-4:] in {'AULT', 'EULT', 'OULT'}:
417	1	code = code[:-2]
418		# The following are blocked by rules above
419		# elif code[-4:-3] in _vows and code[-3:] == 'MPS':
420		# code = code[:-3]
421		# elif code[-3:-2] in _vows and code[-2:] in {'MB', 'MP', 'ND',
422		# 'NS', 'NT'}:
423		# code = code[:-2]
424	1	elif code[-2:-1] == 'R' and code[-1:] in self._uc_c_set:
425	1	code = code[:-1]
426		# IIe2
427	1	elif code[-2:-1] in self._uc_vy_set and code[-1:] in {
428		'D',
		0 ignored issues – show Coding Style introduced 2018-11-04 08:02 UTC by Report Bug Copy Issue Report Wrong hanging indentation before block (add 4 spaces). Loading history...
429		'M',
		0 ignored issues – show Coding Style introduced 2018-11-04 08:02 UTC by Report Bug Copy Issue Report Wrong hanging indentation before block (add 4 spaces). Loading history...
430		'N',
		0 ignored issues – show Coding Style introduced 2018-11-04 08:02 UTC by Report Bug Copy Issue Report Wrong hanging indentation before block (add 4 spaces). Loading history...
431		'S',
		0 ignored issues – show Coding Style introduced 2018-11-04 08:02 UTC by Report Bug Copy Issue Report Wrong hanging indentation before block (add 4 spaces). Loading history...
432		'T',
		0 ignored issues – show Coding Style introduced 2018-11-04 08:02 UTC by Report Bug Copy Issue Report Wrong hanging indentation before block (add 4 spaces). Loading history...
433		}:
434	1	code = code[:-1]
435	1	elif code[-2:] == 'ER':
436	1	code = code[:-1]
437
438		# Drop non-initial vowels
439	1	code = code[:1] + code[1:].translate(
440		{65: '', 69: '', 73: '', 79: '', 85: '', 89: ''}
441		)
442
443	1	if max_length != -1:
444	1	code = code[:max_length]
445
446	1	return code
447
448
449	1	def henry_early(word, max_length=3):
450		"""Calculate the early version of the Henry code for a word.
451
452		This is a wrapper for :py:meth:`HenryEarly.encode`.
453
454		:param str word: the word to transform
455		:param int max_length: the length of the code returned (defaults to 3)
456		:returns: the early Henry code
457		:rtype: str
458
459		>>> henry_early('Marchand')
460		'MRC'
461		>>> henry_early('Beaulieu')
462		'BL'
463		>>> henry_early('Beaumont')
464		'BM'
465		>>> henry_early('Legrand')
466		'LGR'
467		>>> henry_early('Pelletier')
468		'PLT'
469		"""
470	1	return HenryEarly().encode(word, max_length)
471
472
473		if __name__ == '__main__':
474		import doctest
475
476		doctest.testmod()
477

chrislit / abydos

Pull Request — master (#138)

abydos.phonetic._fr.HenryEarly.encode() F

Complexity

Size

Duplication

Code Coverage

Importance

How to fix Long Method Complexity

Long Method

Complexity

Duplication Side-by-Side

Filter issues like