# Copyright 2018-2020 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.

"""abydos.phonetic._dolby.

Dolby Code
"""

from unicodedata import normalize as unicode_normalize

from ._phonetic import _Phonetic

__all__ = ['Dolby']


class Dolby(_Phonetic):
    """Dolby Code.

    This follows "A Spelling Equivalent Abbreviation Algorithm For Personal
    Names" from :cite:`Dolby:1970` and :cite:`Cunningham:1969`.

    .. versionadded:: 0.3.6
    """

    def __init__(
        self,
        max_length: int = -1,
        keep_vowels: bool = False,
        vowel_char: str = '*',
    ) -> None:
        r"""Initialize Dolby instance.

        Parameters
        ----------
        max_length : int
            Maximum length of the returned Dolby code -- this also activates
            the fixed-length code mode if it is greater than 0
        keep_vowels : bool
            If True, retains all vowel markers
        vowel_char : str
            The vowel marker character (defaults to \*)


        .. versionadded:: 0.4.0

        """
        self._max_length = max_length
        self._keep_vowels = keep_vowels
        self._vowel_char = vowel_char

    def encode_alpha(self, word: str) -> str:
        """Return the alphabetic Dolby Code of a name.

        Parameters
        ----------
        word : str
            The word to transform

        Returns
        -------
        str
            The alphabetic Dolby Code

        Examples
        --------
        >>> pe = Dolby()
        >>> pe.encode_alpha('Hansen')
        'HANSN'
        >>> pe.encode_alpha('Larsen')
        'LARSN'
        >>> pe.encode_alpha('Aagaard')
        'AGR'
        >>> pe.encode_alpha('Braaten')
        'BRADN'
        >>> pe.encode_alpha('Sandvik')
        'SANVK'


        .. versionadded:: 0.4.0

        """
        return self.encode(word).replace(self._vowel_char, 'A')

    def encode(self, word: str) -> str:
        """Return the Dolby Code of a name.

        Parameters
        ----------
        word : str
            The word to transform

        Returns
        -------
        str
            The Dolby Code

        Examples
        --------
        >>> pe = Dolby()
        >>> pe.encode('Hansen')
        'H*NSN'
        >>> pe.encode('Larsen')
        'L*RSN'
        >>> pe.encode('Aagaard')
        '*GR'
        >>> pe.encode('Braaten')
        'BR*DN'
        >>> pe.encode('Sandvik')
        'S*NVK'

        >>> pe_6 = Dolby(max_length=6)
        >>> pe_6.encode('Hansen')
        'H*NS*N'
        >>> pe_6.encode('Larsen')
        'L*RS*N'
        >>> pe_6.encode('Aagaard')
        '*G*R  '
        >>> pe_6.encode('Braaten')
        'BR*D*N'
        >>> pe_6.encode('Sandvik')
        'S*NF*K'

        >>> pe.encode('Smith')
        'SM*D'
        >>> pe.encode('Waters')
        'W*DRS'
        >>> pe.encode('James')
        'J*MS'
        >>> pe.encode('Schmidt')
        'SM*D'
        >>> pe.encode('Ashcroft')
        '*SKRFD'

        >>> pe_6.encode('Smith')
        'SM*D  '
        >>> pe_6.encode('Waters')
        'W*D*RS'
        >>> pe_6.encode('James')
        'J*M*S '
        >>> pe_6.encode('Schmidt')
        'SM*D  '
        >>> pe_6.encode('Ashcroft')
        '*SKRFD'


        .. versionadded:: 0.3.0
        .. versionchanged:: 0.3.6
            Encapsulated in class

        """
        # uppercase, normalize, decompose, and filter non-A-Z out
        word = unicode_normalize('NFKD', word.upper())
        word = ''.join(c for c in word if c in self._uc_set)

        # Rule 1 (FL2)
        if word[:3] in {'MCG', 'MAG', 'MAC'}:
            word = 'MK' + word[3:]
        elif word[:2] == 'MC':
            word = 'MK' + word[2:]

        # Rule 2 (FL3)
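        # Scanning right to left, each matched consonant cluster keeps its
        # first letter and drops the second (e.g. 'ND' -> 'N', 'ST' -> 'S').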
        pos = len(word) - 2
        while pos > -1:
            if word[pos : pos + 2] in {
                'DT',
                'LD',
                'ND',
                'NT',
                'RC',
                'RD',
                'RT',
                'SC',
                'SK',
                'ST',
            }:
                word = word[: pos + 1] + word[pos + 2 :]
                pos += 1
            pos -= 1

        # Rule 3 (FL4)
        # Although the rule indicates "after the first letter", the test cases
        # make it clear that these apply to the first letter also.
        word = word.replace('X', 'KS')
        word = word.replace('CE', 'SE')
        word = word.replace('CI', 'SI')
        word = word.replace('CY', 'SI')

        # not in the rule set, but they seem to have intended it
        word = word.replace('TCH', 'CH')

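        # A 'CH' after the first letter whose preceding letter is not a
        # vowel has its 'C' replaced by 'S' (i.e. 'CH' becomes 'SH').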
        pos = word.find('CH', 1)
        while pos != -1:
            if word[pos - 1 : pos] not in self._uc_vy_set:
                word = word[:pos] + 'S' + word[pos + 1 :]
            pos = word.find('CH', pos + 1)

        word = word.replace('C', 'K')
        word = word.replace('Z', 'S')

        word = word.replace('WR', 'R')
        word = word.replace('DG', 'G')
        word = word.replace('QU', 'K')
        word = word.replace('T', 'D')
        word = word.replace('PH', 'F')

        # Rule 4 (FL5)
        # Although the rule indicates "after the first letter", the test cases
        # make it clear that these apply to the first letter also.
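        # A 'K' beyond the second position drops the letter before it,
        # unless that letter is a vowel, 'Y', 'L', 'N', or 'R'.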
        pos = word.find('K', 0)
        while pos != -1:
            if pos > 1 and word[pos - 1 : pos] not in self._uc_vy_set | {
                'L',
                'N',
                'R',
            }:
                word = word[: pos - 1] + word[pos:]
                pos -= 1
            pos = word.find('K', pos + 1)

        # Rule FL6
        if self._max_length > 0 and word[-1:] == 'E':
            word = word[:-1]

        # Rule 5 (FL7)
        word = self._delete_consecutive_repeats(word)

        # Rule 6 (FL8)
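        # An initial 'PF' loses its 'P'; a terminal 'PF' loses its 'F'; a
        # terminal 'GH' becomes 'F' after a vowel and 'G' otherwise; any
        # remaining 'GH' is dropped.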
        if word[:2] == 'PF':
            word = word[1:]
        if word[-2:] == 'PF':
            word = word[:-1]
        elif word[-2:] == 'GH':
            if word[-3:-2] in self._uc_vy_set:
                word = word[:-2] + 'F'
            else:
                word = word[:-2] + 'G'
        word = word.replace('GH', '')

        # Rule FL9
        if self._max_length > 0:
            word = word.replace('V', 'F')

        # Rules 7-9 (FL10-FL12)
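        # Build the code: the first vowel (the first two in fixed-length
        # mode) becomes the vowel marker, later vowels are dropped unless
        # keep_vowels is set, and 'W' or 'H' after the first letter is
        # skipped.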
        first = 1 + (1 if self._max_length > 0 else 0)
        code = ''
        for pos, char in enumerate(word):
            if char in self._uc_vy_set:
                if first or self._keep_vowels:
                    code += self._vowel_char
                    first -= 1
            elif pos > 0 and char in {'W', 'H'}:
                continue
            else:
                code += char

        if self._max_length > 0:
            # Rule FL13
            if len(code) > self._max_length and code[-1:] == 'S':
                code = code[:-1]
            if self._keep_vowels:
                code = code[: self._max_length]
            else:
                # Rule FL14
                code = code[: self._max_length + 2]
                # Rule FL15
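                # Repeatedly drop the rightmost vowel markers and truncate
                # until the code fits within max_length.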
                while len(code) > self._max_length:
                    vowels = len(code) - self._max_length
                    excess = vowels - 1
                    word = code
                    code = ''
                    for char in word:
                        if char == self._vowel_char:
                            if vowels:
                                code += char
                                vowels -= 1
                        else:
                            code += char
                    code = code[: self._max_length + excess]

            # Rule FL16
            code += ' ' * (self._max_length - len(code))

        return code


if __name__ == '__main__':
    import doctest

    doctest.testmod()