abydos.phonetic._fr.HenryEarly.encode() - Code Metrics - Inspection of "0.3.6" - chrislit/abydos - Measure and Improve Code Quality continuously with Scrutinizer

Completed

Pull Request — master (#141)

by Chris

created 2018-11-08 03:44 UTC

abydos.phonetic._fr.HenryEarly.encode() F

↳ Parent: abydos.phonetic._fr

Complexity

Conditions

Size

Total Lines	175
Code Lines	111

Duplication

Lines	0
Ratio	0 %

Code Coverage

Tests	96
CRAP Score	56

Importance

Changes

Metric	Value
eloc	111
dl	0
loc	175
ccs	96
cts	96
cp	1
rs	0
c	0
b	0
f	0
cc	56
nop	3
crap	56

How to fix Long Method Complexity

# -*- coding: utf-8 -*-

# Copyright 2018 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.

"""abydos.phonetic._fr.

The phonetic._fr module implements phonetic algorithms intended for French,
including:

    - FONEM
    - an early version of Henry Code
"""

from __future__ import unicode_literals

from re import compile as re_compile
from unicodedata import normalize as unicode_normalize

from six import text_type

from ._phonetic import Phonetic

__all__ = ['FONEM', 'HenryEarly', 'fonem', 'henry_early']


class FONEM(Phonetic):

    """FONEM phonetic encoder.

    FONEM is a phonetic algorithm designed for French (particularly surnames in
    Saguenay, Canada), defined in :cite:`Bouchard:1981`.

    Guillaume Plique's Javascript implementation :cite:`Plique:2018` at
    https://github.com/Yomguithereal/talisman/blob/master/src/phonetics/french/fonem.js
    was also consulted for this implementation.

    The encoder is table driven: ``_rule_table`` holds the individual rewrite
    rules and ``_rule_order`` fixes the sequence in which :py:meth:`encode`
    applies them.
    """

    # I don't see a sane way of doing this without regexps :(
    #
    # Each entry maps a rule label to a ``(pattern, replacement)`` pair.
    # ``pattern`` is either a compiled regular expression (applied with
    # ``.sub``) or a plain string (applied with ``str.replace``);
    # ``replacement`` is a string or, for C-29, a callable that receives the
    # match object. Labels such as 'V-2,5' presumably follow the rule
    # numbering of the cited paper -- confirm against :cite:`Bouchard:1981`.
    _rule_table = {
        # Vowels & groups of vowels
        'V-1': (re_compile('E?AU'), 'O'),
        'V-2,5': (re_compile('(E?AU|O)L[TX]$'), 'O'),
        'V-3,4': (re_compile('E?AU[TX]$'), 'O'),
        'V-6': (re_compile('E?AUL?D$'), 'O'),
        'V-7': (re_compile(r'(?<!G)AY$'), 'E'),
        'V-8': (re_compile('EUX$'), 'EU'),
        'V-9': (re_compile('EY(?=$|[BCDFGHJKLMNPQRSTVWXZ])'), 'E'),
        'V-10': ('Y', 'I'),
        'V-11': (re_compile('(?<=[AEIOUY])I(?=[AEIOUY])'), 'Y'),
        'V-12': (re_compile('(?<=[AEIOUY])ILL'), 'Y'),
        'V-13': (re_compile('OU(?=[AEOU]|I(?!LL))'), 'W'),
        # Collapse a doubled vowel: the first of two identical vowels is
        # removed, leaving a single copy.
        'V-14': (re_compile(r'([AEIOUY])(?=\1)'), ''),
        # Nasal vowels
        'V-15': (re_compile('[AE]M(?=[BCDFGHJKLMPQRSTVWXZ])(?!$)'), 'EN'),
        'V-16': (re_compile('OM(?=[BCDFGHJKLMPQRSTVWXZ])'), 'ON'),
        'V-17': (re_compile('AN(?=[BCDFGHJKLMNPQRSTVWXZ])'), 'EN'),
        'V-18': (re_compile('(AI[MN]|EIN)(?=[BCDFGHJKLMNPQRSTVWXZ]|$)'), 'IN'),
        'V-19': (re_compile('B(O|U|OU)RNE?$'), 'BURN'),
        'V-20': (
            re_compile(
                '(^IM|(?<=[BCDFGHJKLMNPQRSTVWXZ])'
                + 'IM(?=[BCDFGHJKLMPQRSTVWXZ]))'
            ),
            'IN',
        ),
        # Consonants and groups of consonants
        'C-1': ('BV', 'V'),
        'C-2': (re_compile('(?<=[AEIOUY])C(?=[EIY])'), 'SS'),
        'C-3': (re_compile('(?<=[BDFGHJKLMNPQRSTVWZ])C(?=[EIY])'), 'S'),
        'C-4': (re_compile('^C(?=[EIY])'), 'S'),
        'C-5': (re_compile('^C(?=[OUA])'), 'K'),
        'C-6': (re_compile('(?<=[AEIOUY])C$'), 'K'),
        'C-7': (re_compile('C(?=[BDFGJKLMNPQRSTVWXZ])'), 'K'),
        'C-8': (re_compile('CC(?=[AOU])'), 'K'),
        'C-9': (re_compile('CC(?=[EIY])'), 'X'),
        'C-10': (re_compile('G(?=[EIY])'), 'J'),
        # The '#' written by C-11, C-16 and C-17 is a temporary placeholder;
        # C-34 and C-35 turn it back into a letter at the very end.
        'C-11': (re_compile('GA(?=I?[MN])'), 'G#'),
        'C-12': (re_compile('GE(O|AU)'), 'JO'),
        'C-13': (re_compile('GNI(?=[AEIOUY])'), 'GN'),
        'C-14': (re_compile('(?<![PCS])H'), ''),
        'C-15': ('JEA', 'JA'),
        'C-16': (re_compile('^MAC(?=[BCDFGHJKLMNPQRSTVWXZ])'), 'MA#'),
        'C-17': (re_compile('^MC'), 'MA#'),
        'C-18': ('PH', 'F'),
        'C-19': ('QU', 'K'),
        'C-20': (re_compile('^SC(?=[EIY])'), 'S'),
        'C-21': (re_compile('(?<=.)SC(?=[EIY])'), 'SS'),
        'C-22': (re_compile('(?<=.)SC(?=[AOU])'), 'SK'),
        'C-23': ('SH', 'CH'),
        'C-24': (re_compile('TIA$'), 'SSIA'),
        'C-25': (re_compile('(?<=[AIOUY])W'), ''),
        'C-26': (re_compile('X[CSZ]'), 'X'),
        'C-27': (
            re_compile(
                '(?<=[AEIOUY])Z|(?<=[BCDFGHJKLMNPQRSTVWXZ])'
                + 'Z(?=[BCDFGHJKLMNPQRSTVWXZ])'
            ),
            'S',
        ),
        # C-28 family: reduce doubled consonants to a single letter.
        'C-28': (re_compile(r'([BDFGHJKMNPQRTVWXZ])\1'), r'\1'),
        'C-28a': (re_compile('CC(?=[BCDFGHJKLMNPQRSTVWXZ]|$)'), 'C'),
        'C-28b': (re_compile('((?<=[BCDFGHJKLMNPQRSTVWXZ])|^)SS'), 'S'),
        'C-28bb': (re_compile('SS(?=[BCDFGHJKLMNPQRSTVWXZ]|$)'), 'S'),
        'C-28c': (re_compile('((?<=[^I])|^)LL'), 'L'),
        'C-28d': (re_compile('ILE$'), 'ILLE'),
        # C-29: for the first alternative group 1 spans the whole match, so
        # those endings are left unchanged; for the second alternative only
        # group 2 (the first letter of a final consonant pair) is kept, i.e.
        # the last consonant is dropped.
        'C-29': (
            re_compile(
                '(ILS|[CS]H|[MN]P|R[CFKLNSX])$|([BCDFGHJKL'
                + 'MNPQRSTVWXZ])[BCDFGHJKLMNPQRSTVWXZ]$'
            ),
            lambda m: (m.group(1) or '') + (m.group(2) or ''),
        ),
        'C-30,32': (re_compile('^(SA?INT?|SEI[NM]|CINQ?|ST)(?!E)-?'), 'ST-'),
        'C-31,33': (re_compile('^(SAINTE|STE)-?'), 'STE-'),
        # Rules to undo rule bleeding prevention in C-11, C-16, C-17
        'C-34': ('G#', 'GA'),
        'C-35': ('MA#', 'MAC'),
    }
    # Sequence in which encode() applies the rules above. V-14 and the C-28
    # family are listed twice (first and again near the end), and C-34/C-35
    # come last so that the '#' placeholders are only resolved once every
    # other rule has run.
    _rule_order = (
        'V-14',
        'C-28',
        'C-28a',
        'C-28b',
        'C-28bb',
        'C-28c',
        'C-28d',
        'C-12',
        'C-8',
        'C-9',
        'C-10',
        'C-16',
        'C-17',
        'C-2',
        'C-3',
        'C-7',
        'V-2,5',
        'V-3,4',
        'V-6',
        'V-1',
        'C-14',
        'C-31,33',
        'C-30,32',
        'C-11',
        'V-15',
        'V-17',
        'V-18',
        'V-7',
        'V-8',
        'V-9',
        'V-10',
        'V-11',
        'V-12',
        'V-13',
        'V-16',
        'V-19',
        'V-20',
        'C-1',
        'C-4',
        'C-5',
        'C-6',
        'C-13',
        'C-15',
        'C-18',
        'C-19',
        'C-20',
        'C-21',
        'C-22',
        'C-23',
        'C-24',
        'C-25',
        'C-26',
        'C-27',
        'C-29',
        'V-14',
        'C-28',
        'C-28a',
        'C-28b',
        'C-28bb',
        'C-28c',
        'C-28d',
        'C-34',
        'C-35',
    )

    # Characters kept by encode(): the uppercase letters A-Z plus the hyphen
    # (the hyphen is needed by the 'ST-' / 'STE-' prefixes of C-30..C-33).
    _uc_set = set('ABCDEFGHIJKLMNOPQRSTUVWXYZ-')

    def encode(self, word):
        """Return the FONEM code of a word.

        The word is upper-cased, NFKD-normalized and stripped of every
        character outside ``_uc_set``; the rules named in ``_rule_order`` are
        then applied one after another, each to the output of the previous
        one.

        Args:
            word (str): The word to transform

        Returns:
            str: The FONEM code

        Examples:
            >>> pe = FONEM()
            >>> pe.encode('Marchand')
            'MARCHEN'
            >>> pe.encode('Beaulieu')
            'BOLIEU'
            >>> pe.encode('Beaumont')
            'BOMON'
            >>> pe.encode('Legrand')
            'LEGREN'
            >>> pe.encode('Pelletier')
            'PELETIER'

        """
        # normalize, upper-case, and filter non-French letters
        word = unicode_normalize('NFKD', text_type(word.upper()))
        # 198 and 338 are the code points of the ligatures U+00C6 and U+0152;
        # expand them to 'AE' / 'OE' before filtering.
        word = word.translate({198: 'AE', 338: 'OE'})
        # Anything not in _uc_set is discarded (this includes the combining
        # marks that NFKD splits off accented letters).
        word = ''.join(c for c in word if c in self._uc_set)

        for rule in self._rule_order:
            regex, repl = self._rule_table[rule]
            # Plain-string rules are literal replacements; everything else is
            # a compiled pattern.
            if isinstance(regex, text_type):
                word = word.replace(regex, repl)
            else:
                word = regex.sub(repl, word)

        return word


def fonem(word):
    """Encode a word with the FONEM algorithm.

    Convenience function: it creates a :py:class:`FONEM` instance and
    delegates to :py:meth:`FONEM.encode`.

    Args:
        word (str): The word to transform

    Returns:
        str: The FONEM code

    Examples:
        >>> fonem('Marchand')
        'MARCHEN'
        >>> fonem('Beaulieu')
        'BOLIEU'
        >>> fonem('Beaumont')
        'BOMON'
        >>> fonem('Legrand')
        'LEGREN'
        >>> fonem('Pelletier')
        'PELETIER'

    """
    encoder = FONEM()
    return encoder.encode(word)


class HenryEarly(Phonetic):

    """Henry code, early version.

    The early version of Henry coding is given in :cite:`Legare:1972`. This is
    different from the later version defined in :cite:`Henry:1976`.

    :py:meth:`encode` is the public entry point; the ``_rule_*`` and
    ``_encode_*`` methods are private helpers, one per rule group.
    """

    # NOTE(review): ``_uc_set`` and ``_uc_vy_set`` are read below but are not
    # defined in this class; presumably they are inherited from ``Phonetic``
    # (uppercase letters / uppercase vowels incl. Y) -- confirm.
    _uc_c_set = set('BCDFGHJKLMNPQRSTVWXZ')
    # Word-initial diphthongs and the single vowel that replaces them (Ib3).
    _diph = {
        'AI': 'E',
        'AY': 'E',
        'EI': 'E',
        'AU': 'O',
        'OI': 'O',
        'OU': 'O',
        'EU': 'U',
    }
    # Consonants with an unconditional one-to-one substitution (IIb).
    _simple = {'W': 'V', 'X': 'S', 'Z': 'S'}

    def encode(self, word, max_length=3):
        """Calculate the early version of the Henry code for a word.

        Args:
            word (str): The word to transform
            max_length (int): The length of the code returned (defaults to 3);
                -1 disables truncation

        Returns:
            str: The early Henry code (the empty string if no character of
            ``word`` survives filtering)

        Examples:
            >>> pe = HenryEarly()
            >>> pe.encode('Marchand')
            'MRC'
            >>> pe.encode('Beaulieu')
            'BL'
            >>> pe.encode('Beaumont')
            'BM'
            >>> pe.encode('Legrand')
            'LGR'
            >>> pe.encode('Pelletier')
            'PLT'

        """
        # Upper-case, decompose accented letters and keep only the
        # characters listed in _uc_set.
        word = unicode_normalize('NFKD', text_type(word.upper()))
        word = ''.join(c for c in word if c in self._uc_set)

        if not word:
            return ''

        # Rule Ia seems to be covered entirely in II
        word = self._rule_ib(word)
        code = self._rule_ii(word)
        code = self._rule_iie(code)

        # Drop non-initial vowels (code points of A, E, I, O, U, Y)
        code = code[:1] + code[1:].translate(
            {65: '', 69: '', 73: '', 79: '', 85: '', 89: ''}
        )

        if max_length != -1:
            code = code[:max_length]

        return code

    def _rule_ib(self, word):
        """Rule Ib: rewrite the vowel (or diphthong) that starts the word."""
        first = word[0]
        if first not in self._uc_vy_set:
            return word

        second = word[1:2]
        third = word[2:3]
        consonants = self._uc_c_set

        # Ib1
        if (second in consonants - {'M', 'N'} and third in consonants) or (
            second in consonants and third not in consonants
        ):
            if first == 'Y':
                return 'I' + word[1:]
        # Ib2
        elif second in {'M', 'N'} and third in consonants:
            if first == 'E':
                return 'A' + word[1:]
            if first in {'I', 'U', 'Y'}:
                return 'E' + word[1:]
        # Ib3
        elif word[:2] in self._diph:
            return self._diph[word[:2]] + word[2:]
        # Ib4
        elif second in self._uc_vy_set and first == 'Y':
            return 'I' + word[1:]

        return word

    def _rule_ii(self, word):
        """Rule II: scan the word once and build the unabridged code."""
        vowels = self._uc_vy_set
        # Hoisted: the original rebuilt this set twice per character.
        non_liquids = self._uc_c_set - {'L', 'R'}

        pieces = []
        skip = 0
        for pos, char in enumerate(word):
            if skip:
                # Character already consumed by an earlier multi-letter rule.
                skip -= 1
                continue

            nxch = word[pos + 1 : pos + 2]
            prev = word[pos - 1 : pos]  # '' for the first character

            if char in vowels:
                pieces.append(char)
            # IIc
            elif char == nxch:
                skip = 1
                pieces.append(char)
            elif word[pos : pos + 2] in {'CQ', 'DT', 'SC'}:
                # The first letter of these pairs is dropped.
                pass
            # IIb
            elif char in self._simple:
                pieces.append(self._simple[char])
            elif char == 'C':
                pieces.append(self._encode_c(word, pos))
            elif char == 'G':
                pieces.append(self._encode_g(nxch))
            elif char == 'P':
                pieces.append('F' if nxch == 'H' else 'P')
            elif char == 'Q':
                # 'G' before UE/UI/UY, otherwise 'K' (QUA, QUO, etc.)
                if word[pos + 1 : pos + 3] in {'UE', 'UI', 'UY'}:
                    pieces.append('G')
                else:
                    pieces.append('K')
            elif char == 'S':
                emitted, skip = self._encode_s(word, pos)
                pieces.append(emitted)
            # IId
            elif self._is_dropped(char, prev, nxch, non_liquids):
                pass
            # IIa
            else:
                pieces.append(char)

        return ''.join(pieces)

    def _encode_c(self, word, pos):
        """Return the code letter for a 'C' located at ``word[pos]``."""
        nxch = word[pos + 1 : pos + 2]
        if nxch in {'A', 'O', 'U', 'L', 'R'}:
            return 'K'
        if nxch in {'E', 'I', 'Y'}:
            return 'S'
        # CH stays 'C' before a vowel; CHR, CHL, etc. become 'K'.
        if nxch == 'H' and word[pos + 2 : pos + 3] not in self._uc_vy_set:
            return 'K'
        return 'C'

    @staticmethod
    def _encode_g(nxch):
        """Return the code letter for a 'G' followed by ``nxch``.

        A 'G' in any other context (including word-final) contributes
        nothing to the code.
        """
        if nxch in {'A', 'O', 'U', 'L', 'R'}:
            return 'G'
        if nxch in {'E', 'I', 'Y'}:
            return 'J'
        if nxch == 'N':
            return 'N'
        return ''

    def _encode_s(self, word, pos):
        """Return ``(letters, skip)`` for an 'S' located at ``word[pos]``.

        ``skip`` is the number of following characters the caller must
        consume without encoding them.
        """
        # Longest prefix first so that SAINTE wins over SAINT and STE over ST.
        for prefix in ('SAINTE', 'SAINT', 'STE', 'ST'):
            if word.startswith(prefix, pos):
                return 'X', len(prefix) - 1
        if word[pos + 1 : pos + 2] in self._uc_c_set:
            return '', 0
        return 'S', 0

    def _is_dropped(self, char, prev, nxch, non_liquids):
        """Rule IId: tell whether a consonant is omitted from the code."""
        return (
            (char == 'H' and prev in self._uc_c_set)
            or (char in non_liquids and nxch in non_liquids)
            or (char == 'L' and nxch in {'M', 'N'})
            or (
                char in {'M', 'N'}
                and prev in self._uc_vy_set
                and nxch in self._uc_c_set
            )
        )

    def _rule_iie(self, code):
        """Rule IIe: shorten selected word endings of the unabridged code."""
        # IIe1
        if code[-4:] in {'AULT', 'EULT', 'OULT'}:
            return code[:-2]
        # The remaining IIe1 endings (vowel + MPS, vowel + MB/MP/ND/NS/NT)
        # are, per the original author's note, already blocked by the rules
        # applied in _rule_ii and therefore need no handling here.
        if code[-2:-1] == 'R' and code[-1:] in self._uc_c_set:
            return code[:-1]
        # IIe2
        if code[-2:-1] in self._uc_vy_set and code[-1:] in {
            'D',
            'M',
            'N',
            'S',
            'T',
        }:
            return code[:-1]
        if code[-2:] == 'ER':
            return code[:-1]
        return code


def henry_early(word, max_length=3):
    """Encode a word with the early version of the Henry code.

    Convenience function: it creates a :py:class:`HenryEarly` instance and
    delegates to :py:meth:`HenryEarly.encode`.

    Args:
        word (str): The word to transform
        max_length (int): The length of the code returned (defaults to 3)

    Returns:
        str: The early Henry code

    Examples:
        >>> henry_early('Marchand')
        'MRC'
        >>> henry_early('Beaulieu')
        'BL'
        >>> henry_early('Beaumont')
        'BM'
        >>> henry_early('Legrand')
        'LGR'
        >>> henry_early('Pelletier')
        'PLT'

    """
    encoder = HenryEarly()
    return encoder.encode(word, max_length)


if __name__ == '__main__':
    # Executing the module directly runs the doctests in its docstrings.
    import doctest as _doctest_runner

    _doctest_runner.testmod()


1		# -*- coding: utf-8 -*-
2
3		# Copyright 2018 by Christopher C. Little.
4		# This file is part of Abydos.
5		#
6		# Abydos is free software: you can redistribute it and/or modify
7		# it under the terms of the GNU General Public License as published by
8		# the Free Software Foundation, either version 3 of the License, or
9		# (at your option) any later version.
10		#
11		# Abydos is distributed in the hope that it will be useful,
12		# but WITHOUT ANY WARRANTY; without even the implied warranty of
13		# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14		# GNU General Public License for more details.
15		#
16		# You should have received a copy of the GNU General Public License
17		# along with Abydos. If not, see <http://www.gnu.org/licenses/>.
18
19	1	"""abydos.phonetic._fr.
20
21		The phonetic._fr module implements phonetic algorithms intended for French,
22		including:
23
24		- FONEM
25		- an early version of Henry Code
26		"""
27
28	1	from __future__ import unicode_literals
29
30	1	from re import compile as re_compile
31	1	from unicodedata import normalize as unicode_normalize
32
33	1	from six import text_type
34
35	1	from ._phonetic import Phonetic
36
37	1	__all__ = ['FONEM', 'HenryEarly', 'fonem', 'henry_early']
38
39
40	1	class FONEM(Phonetic):
		0 ignored issues – show Unused Code introduced 2018-11-04 08:02 UTC by Report Bug Copy Issue Report The variable `__class__` seems to be unused. Loading history...
41		"""FONEM.
42
43		FONEM is a phonetic algorithm designed for French (particularly surnames in
44		Saguenay, Canada), defined in :cite:`Bouchard:1981`.
45
46		Guillaume Plique's Javascript implementation :cite:`Plique:2018` at
47		https://github.com/Yomguithereal/talisman/blob/master/src/phonetics/french/fonem.js
48		was also consulted for this implementation.
49		"""
50
51		# I don't see a sane way of doing this without regexps :(
52	1	_rule_table = {
53		# Vowels & groups of vowels
54		'V-1': (re_compile('E?AU'), 'O'),
55		'V-2,5': (re_compile('(E?AU|O)L[TX]$'), 'O'),
56		'V-3,4': (re_compile('E?AU[TX]$'), 'O'),
57		'V-6': (re_compile('E?AUL?D$'), 'O'),
58		'V-7': (re_compile(r'(?<!G)AY$'), 'E'),
59		'V-8': (re_compile('EUX$'), 'EU'),
60		'V-9': (re_compile('EY(?=$|[BCDFGHJKLMNPQRSTVWXZ])'), 'E'),
61		'V-10': ('Y', 'I'),
62		'V-11': (re_compile('(?<=[AEIOUY])I(?=[AEIOUY])'), 'Y'),
63		'V-12': (re_compile('(?<=[AEIOUY])ILL'), 'Y'),
64		'V-13': (re_compile('OU(?=[AEOU]|I(?!LL))'), 'W'),
65		'V-14': (re_compile(r'([AEIOUY])(?=\1)'), ''),
66		# Nasal vowels
67		'V-15': (re_compile('[AE]M(?=[BCDFGHJKLMPQRSTVWXZ])(?!$)'), 'EN'),
68		'V-16': (re_compile('OM(?=[BCDFGHJKLMPQRSTVWXZ])'), 'ON'),
69		'V-17': (re_compile('AN(?=[BCDFGHJKLMNPQRSTVWXZ])'), 'EN'),
70		'V-18': (re_compile('(AI[MN]|EIN)(?=[BCDFGHJKLMNPQRSTVWXZ]|$)'), 'IN'),
71		'V-19': (re_compile('B(O|U|OU)RNE?$'), 'BURN'),
72		'V-20': (
73		re_compile(
74		'(^IM|(?<=[BCDFGHJKLMNPQRSTVWXZ])'
75		+ 'IM(?=[BCDFGHJKLMPQRSTVWXZ]))'
76		),
77		'IN',
78		),
79		# Consonants and groups of consonants
80		'C-1': ('BV', 'V'),
81		'C-2': (re_compile('(?<=[AEIOUY])C(?=[EIY])'), 'SS'),
82		'C-3': (re_compile('(?<=[BDFGHJKLMNPQRSTVWZ])C(?=[EIY])'), 'S'),
83		'C-4': (re_compile('^C(?=[EIY])'), 'S'),
84		'C-5': (re_compile('^C(?=[OUA])'), 'K'),
85		'C-6': (re_compile('(?<=[AEIOUY])C$'), 'K'),
86		'C-7': (re_compile('C(?=[BDFGJKLMNPQRSTVWXZ])'), 'K'),
87		'C-8': (re_compile('CC(?=[AOU])'), 'K'),
88		'C-9': (re_compile('CC(?=[EIY])'), 'X'),
89		'C-10': (re_compile('G(?=[EIY])'), 'J'),
90		'C-11': (re_compile('GA(?=I?[MN])'), 'G#'),
91		'C-12': (re_compile('GE(O|AU)'), 'JO'),
92		'C-13': (re_compile('GNI(?=[AEIOUY])'), 'GN'),
93		'C-14': (re_compile('(?<![PCS])H'), ''),
94		'C-15': ('JEA', 'JA'),
95		'C-16': (re_compile('^MAC(?=[BCDFGHJKLMNPQRSTVWXZ])'), 'MA#'),
96		'C-17': (re_compile('^MC'), 'MA#'),
97		'C-18': ('PH', 'F'),
98		'C-19': ('QU', 'K'),
99		'C-20': (re_compile('^SC(?=[EIY])'), 'S'),
100		'C-21': (re_compile('(?<=.)SC(?=[EIY])'), 'SS'),
101		'C-22': (re_compile('(?<=.)SC(?=[AOU])'), 'SK'),
102		'C-23': ('SH', 'CH'),
103		'C-24': (re_compile('TIA$'), 'SSIA'),
104		'C-25': (re_compile('(?<=[AIOUY])W'), ''),
105		'C-26': (re_compile('X[CSZ]'), 'X'),
106		'C-27': (
107		re_compile(
108		'(?<=[AEIOUY])Z|(?<=[BCDFGHJKLMNPQRSTVWXZ])'
109		+ 'Z(?=[BCDFGHJKLMNPQRSTVWXZ])'
110		),
111		'S',
112		),
113		'C-28': (re_compile(r'([BDFGHJKMNPQRTVWXZ])\1'), r'\1'),
114		'C-28a': (re_compile('CC(?=[BCDFGHJKLMNPQRSTVWXZ]|$)'), 'C'),
115		'C-28b': (re_compile('((?<=[BCDFGHJKLMNPQRSTVWXZ])|^)SS'), 'S'),
116		'C-28bb': (re_compile('SS(?=[BCDFGHJKLMNPQRSTVWXZ]|$)'), 'S'),
117		'C-28c': (re_compile('((?<=[^I])|^)LL'), 'L'),
118		'C-28d': (re_compile('ILE$'), 'ILLE'),
119		'C-29': (
120		re_compile(
121		'(ILS|[CS]H|[MN]P|R[CFKLNSX])$|([BCDFGHJKL'
122		+ 'MNPQRSTVWXZ])[BCDFGHJKLMNPQRSTVWXZ]$'
123		),
124		lambda m: (m.group(1) or '') + (m.group(2) or ''),
125		),
126		'C-30,32': (re_compile('^(SA?INT?|SEI[NM]|CINQ?|ST)(?!E)-?'), 'ST-'),
127		'C-31,33': (re_compile('^(SAINTE|STE)-?'), 'STE-'),
128		# Rules to undo rule bleeding prevention in C-11, C-16, C-17
129		'C-34': ('G#', 'GA'),
130		'C-35': ('MA#', 'MAC'),
131		}
132	1	_rule_order = (
133		'V-14',
134		'C-28',
135		'C-28a',
136		'C-28b',
137		'C-28bb',
138		'C-28c',
139		'C-28d',
140		'C-12',
141		'C-8',
142		'C-9',
143		'C-10',
144		'C-16',
145		'C-17',
146		'C-2',
147		'C-3',
148		'C-7',
149		'V-2,5',
150		'V-3,4',
151		'V-6',
152		'V-1',
153		'C-14',
154		'C-31,33',
155		'C-30,32',
156		'C-11',
157		'V-15',
158		'V-17',
159		'V-18',
160		'V-7',
161		'V-8',
162		'V-9',
163		'V-10',
164		'V-11',
165		'V-12',
166		'V-13',
167		'V-16',
168		'V-19',
169		'V-20',
170		'C-1',
171		'C-4',
172		'C-5',
173		'C-6',
174		'C-13',
175		'C-15',
176		'C-18',
177		'C-19',
178		'C-20',
179		'C-21',
180		'C-22',
181		'C-23',
182		'C-24',
183		'C-25',
184		'C-26',
185		'C-27',
186		'C-29',
187		'V-14',
188		'C-28',
189		'C-28a',
190		'C-28b',
191		'C-28bb',
192		'C-28c',
193		'C-28d',
194		'C-34',
195		'C-35',
196		)
197
198	1	_uc_set = set('ABCDEFGHIJKLMNOPQRSTUVWXYZ-')
199
200	1	def encode(self, word):
201		"""Return the FONEM code of a word.
202
203		Args:
204		word (str): The word to transform
205
206		Returns:
207		str: The FONEM code
208
209		Examples:
210		>>> pe = FONEM()
211		>>> pe.encode('Marchand')
212		'MARCHEN'
213		>>> pe.encode('Beaulieu')
214		'BOLIEU'
215		>>> pe.encode('Beaumont')
216		'BOMON'
217		>>> pe.encode('Legrand')
218		'LEGREN'
219		>>> pe.encode('Pelletier')
220		'PELETIER'
221
222		"""
223		# normalize, upper-case, and filter non-French letters
224	1	word = unicode_normalize('NFKD', text_type(word.upper()))
225	1	word = word.translate({198: 'AE', 338: 'OE'})
226	1	word = ''.join(c for c in word if c in self._uc_set)
227
228	1	for rule in self._rule_order:
229	1	regex, repl = self._rule_table[rule]
230	1	if isinstance(regex, text_type):
231	1	word = word.replace(regex, repl)
232		else:
233	1	word = regex.sub(repl, word)
234
235	1	return word
236
237
238	1	def fonem(word):
239		"""Return the FONEM code of a word.
240
241		This is a wrapper for :py:meth:`FONEM.encode`.
242
243		Args:
244		word (str): The word to transform
245
246		Returns:
247		str: The FONEM code
248
249		Examples:
250		>>> fonem('Marchand')
251		'MARCHEN'
252		>>> fonem('Beaulieu')
253		'BOLIEU'
254		>>> fonem('Beaumont')
255		'BOMON'
256		>>> fonem('Legrand')
257		'LEGREN'
258		>>> fonem('Pelletier')
259		'PELETIER'
260
261		"""
262	1	return FONEM().encode(word)
263
264
265	1	class HenryEarly(Phonetic):
		0 ignored issues – show Unused Code introduced 2018-11-04 08:02 UTC by Report Bug Copy Issue Report The variable `__class__` seems to be unused. Loading history...
266		"""Henry code, early version.
267
268		The early version of Henry coding is given in :cite:`Legare:1972`. This is
269		different from the later version defined in :cite:`Henry:1976`.
270		"""
271
272	1	_uc_c_set = set('BCDFGHJKLMNPQRSTVWXZ')
273	1	_diph = {
274		'AI': 'E',
275		'AY': 'E',
276		'EI': 'E',
277		'AU': 'O',
278		'OI': 'O',
279		'OU': 'O',
280		'EU': 'U',
281		}
282	1	_simple = {'W': 'V', 'X': 'S', 'Z': 'S'}
283
284	1	def encode(self, word, max_length=3):
		0 ignored issues – show Bug introduced 2018-11-04 08:02 UTC by Report Bug Copy Issue Report Parameters differ from overridden 'encode' method Loading history...
285		"""Calculate the early version of the Henry code for a word.
286
287		Args:
288		word (str): The word to transform
289		max_length (int): The length of the code returned (defaults to 3)
290
291		Returns:
292		str: The early Henry code
293
294		Examples:
295		>>> henry_early('Marchand')
296		'MRC'
297		>>> henry_early('Beaulieu')
298		'BL'
299		>>> henry_early('Beaumont')
300		'BM'
301		>>> henry_early('Legrand')
302		'LGR'
303		>>> henry_early('Pelletier')
304		'PLT'
305
306		"""
307	1	word = unicode_normalize('NFKD', text_type(word.upper()))
308	1	word = ''.join(c for c in word if c in self._uc_set)
309
310	1	if not word:
311	1	return ''
312
313		# Rule Ia seems to be covered entirely in II
314
315		# Rule Ib
316	1	if word[0] in self._uc_vy_set:
317		# Ib1
318	1	if (
319		word[1:2] in self._uc_c_set - {'M', 'N'}
		0 ignored issues – show Coding Style introduced 2018-11-04 08:02 UTC by Report Bug Copy Issue Report Wrong hanging indentation before block (add 4 spaces). Loading history...
320		and word[2:3] in self._uc_c_set
		0 ignored issues – show Coding Style introduced 2018-11-04 08:02 UTC by Report Bug Copy Issue Report Wrong hanging indentation before block (add 4 spaces). Loading history...
321		) or (
322		word[1:2] in self._uc_c_set and word[2:3] not in self._uc_c_set
323		):
324	1	if word[0] == 'Y':
325	1	word = 'I' + word[1:]
326		# Ib2
327	1	elif word[1:2] in {'M', 'N'} and word[2:3] in self._uc_c_set:
328	1	if word[0] == 'E':
329	1	word = 'A' + word[1:]
330	1	elif word[0] in {'I', 'U', 'Y'}:
331	1	word = 'E' + word[1:]
332		# Ib3
333	1	elif word[:2] in self._diph:
334	1	word = self._diph[word[:2]] + word[2:]
335		# Ib4
336	1	elif word[1:2] in self._uc_vy_set and word[0] == 'Y':
337	1	word = 'I' + word[1:]
338
339	1	code = ''
340	1	skip = 0
341
342		# Rule II
343	1	for pos, char in enumerate(word):
344	1	nxch = word[pos + 1 : pos + 2]
345	1	prev = word[pos - 1 : pos]
346
347	1	if skip:
348	1	skip -= 1
349	1	elif char in self._uc_vy_set:
350	1	code += char
351		# IIc
352	1	elif char == nxch:
353	1	skip = 1
354	1	code += char
355	1	elif word[pos : pos + 2] in {'CQ', 'DT', 'SC'}:
356	1	continue
357		# IIb
358	1	elif char in self._simple:
359	1	code += self._simple[char]
360	1	elif char in {'C', 'G', 'P', 'Q', 'S'}:
361	1	if char == 'C':
362	1	if nxch in {'A', 'O', 'U', 'L', 'R'}:
363	1	code += 'K'
364	1	elif nxch in {'E', 'I', 'Y'}:
365	1	code += 'S'
366	1	elif nxch == 'H':
367	1	if word[pos + 2 : pos + 3] in self._uc_vy_set:
368	1	code += 'C'
369		else: # CHR, CHL, etc.
370	1	code += 'K'
371		else:
372	1	code += 'C'
373	1	elif char == 'G':
374	1	if nxch in {'A', 'O', 'U', 'L', 'R'}:
375	1	code += 'G'
376	1	elif nxch in {'E', 'I', 'Y'}:
377	1	code += 'J'
378	1	elif nxch == 'N':
379	1	code += 'N'
380	1	elif char == 'P':
381	1	if nxch != 'H':
382	1	code += 'P'
383		else:
384	1	code += 'F'
385	1	elif char == 'Q':
386	1	if word[pos + 1 : pos + 3] in {'UE', 'UI', 'UY'}:
387	1	code += 'G'
388		else: # QUA, QUO, etc.
389	1	code += 'K'
390		else: # S...
391	1	if word[pos : pos + 6] == 'SAINTE':
392	1	code += 'X'
393	1	skip = 5
394	1	elif word[pos : pos + 5] == 'SAINT':
395	1	code += 'X'
396	1	skip = 4
397	1	elif word[pos : pos + 3] == 'STE':
398	1	code += 'X'
399	1	skip = 2
400	1	elif word[pos : pos + 2] == 'ST':
401	1	code += 'X'
402	1	skip = 1
403	1	elif nxch in self._uc_c_set:
404	1	continue
405		else:
406	1	code += 'S'
407		# IId
408	1	elif char == 'H' and prev in self._uc_c_set:
409	1	continue
410	1	elif char in self._uc_c_set - {
411		'L',
		0 ignored issues – show Coding Style introduced 2018-11-04 08:02 UTC by Report Bug Copy Issue Report Wrong hanging indentation before block (add 4 spaces). Loading history...
412		'R',
		0 ignored issues – show Coding Style introduced 2018-11-04 08:02 UTC by Report Bug Copy Issue Report Wrong hanging indentation before block (add 4 spaces). Loading history...
413		} and nxch in self._uc_c_set - {'L', 'R'}:
414	1	continue
415	1	elif char == 'L' and nxch in {'M', 'N'}:
416	1	continue
417	1	elif (
418		char in {'M', 'N'}
		0 ignored issues – show Coding Style introduced 2018-11-04 08:02 UTC by Report Bug Copy Issue Report Wrong hanging indentation before block (add 4 spaces). Loading history...
419		and prev in self._uc_vy_set
		0 ignored issues – show Coding Style introduced 2018-11-04 08:02 UTC by Report Bug Copy Issue Report Wrong hanging indentation before block (add 4 spaces). Loading history...
420		and nxch in self._uc_c_set
		0 ignored issues – show Coding Style introduced 2018-11-04 08:02 UTC by Report Bug Copy Issue Report Wrong hanging indentation before block (add 4 spaces). Loading history...
421		):
422	1	continue
423		# IIa
424		else:
425	1	code += char
426
427		# IIe1
428	1	if code[-4:] in {'AULT', 'EULT', 'OULT'}:
429	1	code = code[:-2]
430		# The following are blocked by rules above
431		# elif code[-4:-3] in _vows and code[-3:] == 'MPS':
432		# code = code[:-3]
433		# elif code[-3:-2] in _vows and code[-2:] in {'MB', 'MP', 'ND',
434		# 'NS', 'NT'}:
435		# code = code[:-2]
436	1	elif code[-2:-1] == 'R' and code[-1:] in self._uc_c_set:
437	1	code = code[:-1]
438		# IIe2
439	1	elif code[-2:-1] in self._uc_vy_set and code[-1:] in {
440		'D',
		0 ignored issues – show Coding Style introduced 2018-11-04 08:02 UTC by Report Bug Copy Issue Report Wrong hanging indentation before block (add 4 spaces). Loading history...
441		'M',
		0 ignored issues – show Coding Style introduced 2018-11-04 08:02 UTC by Report Bug Copy Issue Report Wrong hanging indentation before block (add 4 spaces). Loading history...
442		'N',
		0 ignored issues – show Coding Style introduced 2018-11-04 08:02 UTC by Report Bug Copy Issue Report Wrong hanging indentation before block (add 4 spaces). Loading history...
443		'S',
		0 ignored issues – show Coding Style introduced 2018-11-04 08:02 UTC by Report Bug Copy Issue Report Wrong hanging indentation before block (add 4 spaces). Loading history...
444		'T',
		0 ignored issues – show Coding Style introduced 2018-11-04 08:02 UTC by Report Bug Copy Issue Report Wrong hanging indentation before block (add 4 spaces). Loading history...
445		}:
446	1	code = code[:-1]
447	1	elif code[-2:] == 'ER':
448	1	code = code[:-1]
449
450		# Drop non-initial vowels
451	1	code = code[:1] + code[1:].translate(
452		{65: '', 69: '', 73: '', 79: '', 85: '', 89: ''}
453		)
454
455	1	if max_length != -1:
456	1	code = code[:max_length]
457
458	1	return code
459
460
461	1	def henry_early(word, max_length=3):
462		"""Calculate the early version of the Henry code for a word.
463
464		This is a wrapper for :py:meth:`HenryEarly.encode`.
465
466		Args:
467		word (str): The word to transform
468		max_length (int): The length of the code returned (defaults to 3)
469
470		Returns:
471		str: The early Henry code
472
473		Examples:
474		>>> henry_early('Marchand')
475		'MRC'
476		>>> henry_early('Beaulieu')
477		'BL'
478		>>> henry_early('Beaumont')
479		'BM'
480		>>> henry_early('Legrand')
481		'LGR'
482		>>> henry_early('Pelletier')
483		'PLT'
484
485		"""
486	1	return HenryEarly().encode(word, max_length)
487
488
489		if __name__ == '__main__':
490		import doctest
491
492		doctest.testmod()
493

chrislit / abydos

Pull Request — master (#141)

abydos.phonetic._fr.HenryEarly.encode() F

Complexity

Size

Duplication

Code Coverage

Importance

How to fix Long Method Complexity

Long Method

Complexity

Duplication Side-by-Side

Filter issues like