abydos.phonetic._henry_early.HenryEarly.encode() - Code Metrics - chrislit/abydos - Measure and Improve Code Quality continuously with Scrutinizer

abydos.phonetic._henry_early.HenryEarly.encode() F
last analyzed 2020-12-31 20:10 UTC

↳ Parent: abydos.phonetic._henry_early

Complexity

Conditions

Size

Total Lines	185
Code Lines	111

Duplication

Lines	0
Ratio	0 %

Code Coverage

Tests	91
CRAP Score	56

Importance

Changes

Metric	Value
eloc	111
dl	0
loc	185
ccs	91
cts	91
cp	1
rs	0
c	0
b	0
f	0
cc	56
nop	2
crap	56

How to fix Long Method Complexity

# Copyright 2018-2020 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.

"""abydos.phonetic._henry_early.

an early version of Henry Code
"""

from unicodedata import normalize as unicode_normalize

from ._phonetic import _Phonetic

__all__ = ['HenryEarly']


class HenryEarly(_Phonetic):
    """Henry code, early version.

    The early version of Henry coding is given in :cite:`Legare:1972`. This is
    different from the later version defined in :cite:`Henry:1976`.

    Encoding proceeds in stages, each handled by a private helper: the
    initial-vowel rules (Ib), the character-by-character rules (II), the
    word-ending rules (IIe), and finally removal of non-initial vowels and
    truncation to the configured length.

    .. versionadded:: 0.3.6
    """

    _uc_c_set = set('BCDFGHJKLMNPQRSTVWXZ')
    _diph = {
        'AI': 'E',
        'AY': 'E',
        'EI': 'E',
        'AU': 'O',
        'OI': 'O',
        'OU': 'O',
        'EU': 'U',
    }
    _simple = {'W': 'V', 'X': 'S', 'Z': 'S'}
    # Spellings starting with S that are coded as a single 'X'. Ordered
    # longest first so that the longest matching form wins.
    _saint_forms = ('SAINTE', 'SAINT', 'STE', 'ST')
    # Translation table that deletes the vowels A, E, I, O, U, and Y.
    _vowel_deletions = str.maketrans('', '', 'AEIOUY')

    def __init__(self, max_length: int = 3) -> None:
        """Initialize HenryEarly instance.

        Parameters
        ----------
        max_length : int
            The length of the code returned (defaults to 3). A value of -1
            disables truncation.


        .. versionadded:: 0.4.0

        """
        self._max_length = max_length

    def encode(self, word: str) -> str:
        """Calculate the early version of the Henry code for a word.

        Parameters
        ----------
        word : str
            The word to transform

        Returns
        -------
        str
            The early Henry code (the empty string if the word contains no
            characters from the uppercase letter set after normalization)

        Examples
        --------
        >>> pe = HenryEarly()
        >>> pe.encode('Marchand')
        'MRC'
        >>> pe.encode('Beaulieu')
        'BL'
        >>> pe.encode('Beaumont')
        'BM'
        >>> pe.encode('Legrand')
        'LGR'
        >>> pe.encode('Pelletier')
        'PLT'


        .. versionadded:: 0.3.0
        .. versionchanged:: 0.3.6
            Encapsulated in class

        """
        # Uppercase and decompose so that accented letters split into a base
        # letter plus combining marks; the filter then drops the marks.
        word = unicode_normalize('NFKD', word.upper())
        word = ''.join(c for c in word if c in self._uc_set)

        if not word:
            return ''

        # Rule Ia seems to be covered entirely in II
        word = self._apply_initial_vowel_rules(word)
        code = self._apply_body_rules(word)
        code = self._apply_ending_rules(code)

        # Drop non-initial vowels
        code = code[:1] + code[1:].translate(self._vowel_deletions)

        if self._max_length != -1:
            code = code[: self._max_length]

        return code

    def _apply_initial_vowel_rules(self, word: str) -> str:
        """Apply Rule Ib: rewrite a word-initial vowel (or Y).

        Parameters
        ----------
        word : str
            The normalized, non-empty, uppercase word

        Returns
        -------
        str
            The word with its first one or two letters possibly replaced

        """
        first = word[0]
        if first not in self._uc_vy_set:
            return word

        # Slices rather than indexes: these are '' when the word is short.
        second = word[1:2]
        third = word[2:3]
        consonants = self._uc_c_set
        third_is_consonant = third in consonants

        # Ib1: vowel + consonant, except M/N followed by another consonant
        if second in consonants and (
            second not in {'M', 'N'} or not third_is_consonant
        ):
            if first == 'Y':
                return 'I' + word[1:]
        # Ib2: vowel + M/N + consonant
        elif second in {'M', 'N'} and third_is_consonant:
            if first == 'E':
                return 'A' + word[1:]
            if first in {'I', 'U', 'Y'}:
                return 'E' + word[1:]
        # Ib3: initial diphthong collapses to a single vowel
        elif word[:2] in self._diph:
            return self._diph[word[:2]] + word[2:]
        # Ib4: Y + vowel
        elif second in self._uc_vy_set and first == 'Y':
            return 'I' + word[1:]

        return word

    def _apply_body_rules(self, word: str) -> str:
        """Apply Rule II: transcribe the word one character at a time.

        Parameters
        ----------
        word : str
            The word after the initial-vowel rules have been applied

        Returns
        -------
        str
            The intermediate code, still containing vowels

        """
        vowels = self._uc_vy_set
        consonants = self._uc_c_set
        # Loop-invariant, so built once rather than once per character.
        non_liquids = consonants - {'L', 'R'}

        pieces = []
        skip = 0  # number of upcoming characters already consumed

        for pos, char in enumerate(word):
            if skip:
                skip -= 1
                continue

            nxch = word[pos + 1 : pos + 2]
            prev = word[pos - 1 : pos]  # '' at pos 0

            if char in vowels:
                pieces.append(char)
            # IIc: a doubled consonant is emitted once, unmodified
            elif char == nxch:
                skip = 1
                pieces.append(char)
            elif word[pos : pos + 2] in {'CQ', 'DT', 'SC'}:
                continue
            # IIb
            elif char in self._simple:
                pieces.append(self._simple[char])
            elif char in {'C', 'G', 'P', 'Q'}:
                pieces.append(self._encode_cgpq(word, pos))
            elif char == 'S':
                consumed = self._saint_form_length(word, pos)
                if consumed:
                    pieces.append('X')
                    skip = consumed - 1
                elif nxch not in consonants:
                    pieces.append('S')
                # otherwise: S before a consonant is dropped
            # IId
            elif char == 'H' and prev in consonants:
                continue
            elif char in non_liquids and nxch in non_liquids:
                continue
            elif char == 'L' and nxch in {'M', 'N'}:
                continue
            elif (
                char in {'M', 'N'} and prev in vowels and nxch in consonants
            ):
                continue
            # IIa
            else:
                pieces.append(char)

        return ''.join(pieces)

    def _encode_cgpq(self, word: str, pos: int) -> str:
        """Return the code for a context-dependent C, G, P, or Q.

        Parameters
        ----------
        word : str
            The word being encoded
        pos : int
            The position in the word of a C, G, P, or Q

        Returns
        -------
        str
            The letter to emit; empty for a G that is followed by none of
            A, O, U, L, R, E, I, Y, or N

        """
        char = word[pos]
        nxch = word[pos + 1 : pos + 2]

        if char == 'C':
            if nxch in {'A', 'O', 'U', 'L', 'R'}:
                return 'K'
            if nxch in {'E', 'I', 'Y'}:
                return 'S'
            # CHR, CHL, etc.: CH not followed by a vowel
            if nxch == 'H' and word[pos + 2 : pos + 3] not in self._uc_vy_set:
                return 'K'
            return 'C'

        if char == 'G':
            if nxch in {'A', 'O', 'U', 'L', 'R'}:
                return 'G'
            if nxch in {'E', 'I', 'Y'}:
                return 'J'
            if nxch == 'N':
                return 'N'
            return ''

        if char == 'P':
            return 'F' if nxch == 'H' else 'P'

        # Q: QUE, QUI, QUY give G; QUA, QUO, etc. give K
        if word[pos + 1 : pos + 3] in {'UE', 'UI', 'UY'}:
            return 'G'
        return 'K'

    def _saint_form_length(self, word: str, pos: int) -> int:
        """Return the length of the SAINTE/SAINT/STE/ST form at a position.

        Parameters
        ----------
        word : str
            The word being encoded
        pos : int
            The position in the word to test

        Returns
        -------
        int
            The length of the longest matching form, or 0 if none matches

        """
        for form in self._saint_forms:
            if word.startswith(form, pos):
                return len(form)
        return 0

    def _apply_ending_rules(self, code: str) -> str:
        """Apply Rule IIe: trim the end of the intermediate code.

        Parameters
        ----------
        code : str
            The intermediate code produced by the Rule II pass

        Returns
        -------
        str
            The code with at most one ending rule applied

        """
        # IIe1
        if code[-4:] in {'AULT', 'EULT', 'OULT'}:
            return code[:-2]

        # NOTE: the original implementation recorded that the endings
        # vowel+MPS and vowel+MB/MP/ND/NS/NT are blocked by the rules above,
        # so they are not handled here.
        penultimate = code[-2:-1]
        last = code[-1:]

        if penultimate == 'R' and last in self._uc_c_set:
            return code[:-1]
        # IIe2
        if penultimate in self._uc_vy_set and last in {
            'D',
            'M',
            'N',
            'S',
            'T',
        }:
            return code[:-1]
        if code[-2:] == 'ER':
            return code[:-1]

        return code


# When executed directly as a script, run this module's doctests.
if __name__ == '__main__':
    from doctest import testmod

    testmod()


1		# Copyright 2018-2020 by Christopher C. Little.
2		# This file is part of Abydos.
3		#
4		# Abydos is free software: you can redistribute it and/or modify
5		# it under the terms of the GNU General Public License as published by
6		# the Free Software Foundation, either version 3 of the License, or
7		# (at your option) any later version.
8		#
9		# Abydos is distributed in the hope that it will be useful,
10		# but WITHOUT ANY WARRANTY; without even the implied warranty of
11		# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12		# GNU General Public License for more details.
13		#
14		# You should have received a copy of the GNU General Public License
15		# along with Abydos. If not, see <http://www.gnu.org/licenses/>.
16
17		"""abydos.phonetic._henry_early.
18
19	1	an early version of Henry Code
20		"""
21
22		from unicodedata import normalize as unicode_normalize
23
24	1	from ._phonetic import _Phonetic
25
26		__all__ = ['HenryEarly']
27
28
29		class HenryEarly(_Phonetic):
30		"""Henry code, early version.
31	1
32		The early version of Henry coding is given in :cite:`Legare:1972`. This is
33	1	different from the later version defined in :cite:`Henry:1976`.
34
35	1	.. versionadded:: 0.3.6
36		"""
37	1
38	1	_uc_c_set = set('BCDFGHJKLMNPQRSTVWXZ')
39		_diph = {
40	1	'AI': 'E',
41		'AY': 'E',
42		'EI': 'E',
43	1	'AU': 'O',
44		'OI': 'O',
45		'OU': 'O',
46		'EU': 'U',
47		}
48		_simple = {'W': 'V', 'X': 'S', 'Z': 'S'}
49
50		def __init__(self, max_length: int = 3) -> None:
51		"""Initialize HenryEarly instance.
52	1
53	1	Parameters
54		----------
55		max_length : int
56		The length of the code returned (defaults to 3)
57
58
59		.. versionadded:: 0.4.0
60
61		"""
62	1	self._max_length = max_length
63
64	1	def encode(self, word: str) -> str:
65		"""Calculate the early version of the Henry code for a word.
66
67		Parameters
68		----------
69		word : str
70		The word to transform
71
72		Returns
73		-------
74		str
75		The early Henry code
76	1
77		Examples
78	1	--------
79		>>> pe = HenryEarly()
80		>>> pe.encode('Marchand')
81		'MRC'
82		>>> pe.encode('Beaulieu')
83		'BL'
84		>>> pe.encode('Beaumont')
85		'BM'
86		>>> pe.encode('Legrand')
87		'LGR'
88		>>> pe.encode('Pelletier')
89		'PLT'
90
91
92		.. versionadded:: 0.3.0
93		.. versionchanged:: 0.3.6
94		Encapsulated in class
95
96		"""
97		word = unicode_normalize('NFKD', word.upper())
98		word = ''.join(c for c in word if c in self._uc_set)
99
100		if not word:
101		return ''
102
103		# Rule Ia seems to be covered entirely in II
104
105		# Rule Ib
106		if word[0] in self._uc_vy_set:
107		# Ib1
108		if (
109		word[1:2] in self._uc_c_set - {'M', 'N'}
110		and word[2:3] in self._uc_c_set
111	1	) or (
112	1	word[1:2] in self._uc_c_set and word[2:3] not in self._uc_c_set
113		):
114	1	if word[0] == 'Y':
115	1	word = 'I' + word[1:]
116		# Ib2
117		elif word[1:2] in {'M', 'N'} and word[2:3] in self._uc_c_set:
118		if word[0] == 'E':
119		word = 'A' + word[1:]
120	1	elif word[0] in {'I', 'U', 'Y'}:
121		word = 'E' + word[1:]
122	1	# Ib3
123		elif word[:2] in self._diph:
124		word = self._diph[word[:2]] + word[2:]
125		# Ib4
126		elif word[1:2] in self._uc_vy_set and word[0] == 'Y':
127		word = 'I' + word[1:]
128	1
129	1	code = ''
130		skip = 0
131	1
132	1	# Rule II
133	1	for pos, char in enumerate(word):
134	1	nxch = word[pos + 1 : pos + 2]
135	1	prev = word[pos - 1 : pos]
136
137	1	if skip:
138	1	skip -= 1
139		elif char in self._uc_vy_set:
140	1	code += char
141	1	# IIc
142		elif char == nxch:
143	1	skip = 1
144	1	code += char
145		elif word[pos : pos + 2] in {'CQ', 'DT', 'SC'}:
146		continue
147	1	# IIb
148	1	elif char in self._simple:
149	1	code += self._simple[char]
150		elif char in {'C', 'G', 'P', 'Q', 'S'}:
151	1	if char == 'C':
152	1	if nxch in {'A', 'O', 'U', 'L', 'R'}:
153	1	code += 'K'
154	1	elif nxch in {'E', 'I', 'Y'}:
155		code += 'S'
156	1	elif nxch == 'H':
157	1	if word[pos + 2 : pos + 3] in self._uc_vy_set:
158	1	code += 'C'
159	1	else: # CHR, CHL, etc.
160	1	code += 'K'
161		else:
162	1	code += 'C'
163	1	elif char == 'G':
164	1	if nxch in {'A', 'O', 'U', 'L', 'R'}:
165	1	code += 'G'
166	1	elif nxch in {'E', 'I', 'Y'}:
167	1	code += 'J'
168	1	elif nxch == 'N':
169	1	code += 'N'
170	1	elif char == 'P':
171	1	if nxch != 'H':
172	1	code += 'P'
173		else:
174	1	code += 'F'
175		elif char == 'Q':
176	1	if word[pos + 1 : pos + 3] in {'UE', 'UI', 'UY'}:
177	1	code += 'G'
178	1	else: # QUA, QUO, etc.
179	1	code += 'K'
180	1	else: # S...
181	1	if word[pos : pos + 6] == 'SAINTE':
182	1	code += 'X'
183	1	skip = 5
184	1	elif word[pos : pos + 5] == 'SAINT':
185	1	code += 'X'
186	1	skip = 4
187		elif word[pos : pos + 3] == 'STE':
188	1	code += 'X'
189	1	skip = 2
190	1	elif word[pos : pos + 2] == 'ST':
191	1	code += 'X'
192		skip = 1
193	1	elif nxch in self._uc_c_set:
194		continue
195	1	else:
196	1	code += 'S'
197	1	# IId
198	1	elif char == 'H' and prev in self._uc_c_set:
199	1	continue
200	1	elif char in self._uc_c_set - {
201	1	'L',
202	1	'R',
203	1	} and nxch in self._uc_c_set - {'L', 'R'}:
204	1	continue
205	1	elif char == 'L' and nxch in {'M', 'N'}:
206	1	continue
207	1	elif (
208	1	char in {'M', 'N'}
209		and prev in self._uc_vy_set
210	1	and nxch in self._uc_c_set
211		):
212	1	continue
213	1	# IIa
214	1	else:
215		code += char
216
217		# IIe1
218	1	if code[-4:] in {'AULT', 'EULT', 'OULT'}:
219	1	code = code[:-2]
220	1	# The following are blocked by rules above
221	1	# elif code[-4:-3] in _vows and code[-3:] == 'MPS':
222		# code = code[:-3]
223		# elif code[-3:-2] in _vows and code[-2:] in {'MB', 'MP', 'ND',
224		# 'NS', 'NT'}:
225		# code = code[:-2]
226	1	elif code[-2:-1] == 'R' and code[-1:] in self._uc_c_set:
227		code = code[:-1]
228		# IIe2
229	1	elif code[-2:-1] in self._uc_vy_set and code[-1:] in {
230		'D',
231		'M',
232	1	'N',
233	1	'S',
234		'T',
235		}:
236		code = code[:-1]
237		elif code[-2:] == 'ER':
238		code = code[:-1]
239
240	1	# Drop non-initial vowels
241	1	code = code[:1] + code[1:].translate(
242		{65: '', 69: '', 73: '', 79: '', 85: '', 89: ''}
243	1	)
244
245		if self._max_length != -1:
246		code = code[: self._max_length]
247
248		return code
249
250	1
251	1	if __name__ == '__main__':
252	1	import doctest
253
254		doctest.testmod()
255

chrislit / abydos

abydos.phonetic._henry_early.HenryEarly.encode() F last analyzed 2020-12-31 20:10 UTC

Complexity

Size

Duplication

Code Coverage

Importance

How to fix Long Method Complexity

Long Method

Complexity

Duplication Side-by-Side

Filter issues like

abydos.phonetic._henry_early.HenryEarly.encode() F
last analyzed 2020-12-31 20:10 UTC