abydos.phonetic._dolby - Code Metrics - Inspection of "78a222a9f7d8976f6744d263e3d6d01a2a991c27" - chrislit/abydos - Measure and Improve Code Quality continuously with Scrutinizer

Completed

Branch — master (78a222)

by Chris

created 2018-10-26 11:30 UTC

abydos.phonetic._dolby A

↳ Parent: Project

Complexity

Total Complexity

Size/Duplication

Total Lines	261
Duplicated Lines	0 %

Test Coverage

Coverage

100%

Importance

Changes

Metric	Value
wmc	32
eloc	132
dl	0
loc	261
ccs	88
cts	88
cp	1
rs	9.84
c	0
b	0
f	0

1 Function

Rating	Name	Duplication	Size	Complexity
F	dolby()	0	220	32

# -*- coding: utf-8 -*-

# Copyright 2018 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.

"""abydos.phonetic._dolby.

The phonetic._dolby module implements the Dolby Code algorithm.
"""

from __future__ import unicode_literals

from unicodedata import normalize as unicode_normalize

from six import text_type

from ._util import _delete_consecutive_repeats

__all__ = ['dolby']


# Letters that survive normalization; every other character is discarded.
_DOLBY_LETTERS = frozenset('ABCDEFGHIJKLMNOPQRSTUVWXYZ')

# Letters treated as vowels by the rules below (Y is included).
_DOLBY_VOWELS = frozenset('AEIOUY')

# Letters that, like vowels, protect the letter before a K in Rule 4 (FL5).
_DOLBY_K_PREDECESSORS = _DOLBY_VOWELS | {'L', 'N', 'R'}

# Pairs whose second letter is deleted by Rule 2 (FL3).
_DOLBY_REDUCIBLE_PAIRS = frozenset(
    ['DT', 'LD', 'ND', 'NT', 'RC', 'RD', 'RT', 'SC', 'SK', 'ST']
)


def _dolby_normalize(word):
    """Uppercase and NFKD-decompose word, keeping only the letters A-Z."""
    word = unicode_normalize('NFKD', text_type(word.upper()))
    word = word.replace('ß', 'SS')
    return ''.join(char for char in word if char in _DOLBY_LETTERS)


def _dolby_substitute(word):
    """Apply the spelling substitutions of Rules 1-4 (FL2-FL5)."""
    # Rule 1 (FL2)
    if word[:3] in {'MCG', 'MAG', 'MAC'}:
        word = 'MK' + word[3:]
    elif word[:2] == 'MC':
        word = 'MK' + word[2:]

    # Rule 2 (FL3)
    # Scan right to left. After a deletion the same position is examined
    # again, because the shortened word may expose a new reducible pair.
    pos = len(word) - 2
    while pos > -1:
        if word[pos : pos + 2] in _DOLBY_REDUCIBLE_PAIRS:
            word = word[: pos + 1] + word[pos + 2 :]
        else:
            pos -= 1

    # Rule 3 (FL4)
    # Although the rule indicates "after the first letter", the test cases make
    # it clear that these apply to the first letter also.
    # TCH -> CH is not in the rule set, but they seem to have intended it.
    for old, new in (
        ('X', 'KS'),
        ('CE', 'SE'),
        ('CI', 'SI'),
        ('CY', 'SI'),
        ('TCH', 'CH'),
    ):
        word = word.replace(old, new)

    # A non-initial CH that does not follow a vowel becomes SH. Any CH left
    # over is turned into KH by the C -> K substitution below.
    pos = word.find('CH', 1)
    while pos != -1:
        if word[pos - 1 : pos] not in _DOLBY_VOWELS:
            word = word[:pos] + 'S' + word[pos + 1 :]
        pos = word.find('CH', pos + 1)

    # The order matters: e.g. QU -> K has to precede Rule 4, and C -> K has
    # to follow the CE/CI/CY/CH handling above.
    for old, new in (
        ('C', 'K'),
        ('Z', 'S'),
        ('WR', 'R'),
        ('DG', 'G'),
        ('QU', 'K'),
        ('T', 'D'),
        ('PH', 'F'),
    ):
        word = word.replace(old, new)

    # Rule 4 (FL5)
    # Delete the letter preceding a K unless it is a vowel, L, N, or R.
    # NOTE(review): the guard `pos > 1` means the first letter of the word
    # is never deleted by this rule.
    pos = word.find('K', 0)
    while pos != -1:
        if pos > 1 and word[pos - 1 : pos] not in _DOLBY_K_PREDECESSORS:
            word = word[: pos - 1] + word[pos:]
            pos -= 1
        pos = word.find('K', pos + 1)

    return word


def _dolby_trim(word, fixed_length):
    """Apply Rules 5-6 (FL7-FL8) plus the fixed-length-only FL6 and FL9."""
    # Rule FL6
    if fixed_length and word[-1:] == 'E':
        word = word[:-1]

    # Rule 5 (FL7)
    word = _delete_consecutive_repeats(word)

    # Rule 6 (FL8)
    if word[:2] == 'PF':
        word = word[1:]
    if word[-2:] == 'PF':
        word = word[:-1]
    elif word[-2:] == 'GH':
        # A final GH becomes F after a vowel and G otherwise.
        word = word[:-2] + ('F' if word[-3:-2] in _DOLBY_VOWELS else 'G')
    word = word.replace('GH', '')

    # Rule FL9
    if fixed_length:
        word = word.replace('V', 'F')

    return word


def _dolby_mark_vowels(word, markers, keep_vowels, vowel_char):
    """Apply Rules 7-9 (FL10-FL12): mark vowels and drop non-initial W/H."""
    pieces = []
    for pos, char in enumerate(word):
        if char in _DOLBY_VOWELS:
            # Only the first `markers` vowels are marked, unless every vowel
            # is to be kept; the remaining vowels are dropped.
            if markers or keep_vowels:
                pieces.append(vowel_char)
                markers -= 1
        elif pos == 0 or char not in {'W', 'H'}:
            pieces.append(char)
    return ''.join(pieces)


def _dolby_limit_markers(code, vowel_char, limit):
    """Return code with all but its first `limit` vowel markers removed."""
    kept = []
    for char in code:
        if char != vowel_char:
            kept.append(char)
        elif limit:
            kept.append(char)
            limit -= 1
    return ''.join(kept)


def _dolby_fit_length(code, max_length, keep_vowels, vowel_char):
    """Apply Rules FL13-FL16: cut or pad code to exactly max_length."""
    # Rule FL13
    if len(code) > max_length and code[-1:] == 'S':
        code = code[:-1]

    if keep_vowels:
        code = code[:max_length]
    else:
        # Rule FL14
        code = code[: max_length + 2]
        # Rule FL15
        # Each pass keeps at most `overflow` vowel markers and then cuts the
        # code one character shorter, so the loop always terminates.
        while len(code) > max_length:
            overflow = len(code) - max_length
            code = _dolby_limit_markers(code, vowel_char, overflow)
            code = code[: max_length + overflow - 1]

    # Rule FL16
    return code + ' ' * (max_length - len(code))


def dolby(word, max_length=-1, keep_vowels=False, vowel_char='*'):
    r"""Return the Dolby Code of a name.

    This follows "A Spelling Equivalent Abbreviation Algorithm For Personal
    Names" from :cite:`Dolby:1970` and :cite:`Cunningham:1969`.

    The word is uppercased and reduced to the letters A-Z, rewritten by the
    spelling rules (Rules 1-6), and finally encoded by replacing vowels with
    a marker and dropping non-initial W and H (Rules 7-9). In fixed-length
    mode the additional FL rules apply and the result is cut or padded with
    spaces to exactly ``max_length`` characters.

    :param word: the word to encode
    :param max_length: maximum length of the returned Dolby code -- this also
        activates the fixed-length code mode if it is greater than 0
    :param keep_vowels: if True, retains all vowel markers
    :param vowel_char: the vowel marker character (default to \*)
    :returns: the Dolby Code
    :rtype: str

    >>> dolby('Hansen')
    'H*NSN'
    >>> dolby('Larsen')
    'L*RSN'
    >>> dolby('Aagaard')
    '*GR'
    >>> dolby('Braaten')
    'BR*DN'
    >>> dolby('Sandvik')
    'S*NVK'
    >>> dolby('Hansen', max_length=6)
    'H*NS*N'
    >>> dolby('Larsen', max_length=6)
    'L*RS*N'
    >>> dolby('Aagaard', max_length=6)
    '*G*R  '
    >>> dolby('Braaten', max_length=6)
    'BR*D*N'
    >>> dolby('Sandvik', max_length=6)
    'S*NF*K'

    >>> dolby('Smith')
    'SM*D'
    >>> dolby('Waters')
    'W*DRS'
    >>> dolby('James')
    'J*MS'
    >>> dolby('Schmidt')
    'SM*D'
    >>> dolby('Ashcroft')
    '*SKRFD'
    >>> dolby('Smith', max_length=6)
    'SM*D  '
    >>> dolby('Waters', max_length=6)
    'W*D*RS'
    >>> dolby('James', max_length=6)
    'J*M*S '
    >>> dolby('Schmidt', max_length=6)
    'SM*D  '
    >>> dolby('Ashcroft', max_length=6)
    '*SKRFD'
    """
    fixed_length = max_length > 0

    word = _dolby_normalize(word)
    word = _dolby_substitute(word)
    word = _dolby_trim(word, fixed_length)

    # The variable-length code marks one vowel, the fixed-length code two.
    markers = 2 if fixed_length else 1
    code = _dolby_mark_vowels(word, markers, keep_vowels, vowel_char)

    if fixed_length:
        code = _dolby_fit_length(code, max_length, keep_vowels, vowel_char)

    return code


if __name__ == '__main__':
    # Executed as a script: run the doctests embedded in this module.
    from doctest import testmod

    testmod()


1		# -*- coding: utf-8 -*-
2
3		# Copyright 2018 by Christopher C. Little.
4		# This file is part of Abydos.
5		#
6		# Abydos is free software: you can redistribute it and/or modify
7		# it under the terms of the GNU General Public License as published by
8		# the Free Software Foundation, either version 3 of the License, or
9		# (at your option) any later version.
10		#
11		# Abydos is distributed in the hope that it will be useful,
12		# but WITHOUT ANY WARRANTY; without even the implied warranty of
13		# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14		# GNU General Public License for more details.
15		#
16		# You should have received a copy of the GNU General Public License
17		# along with Abydos. If not, see <http://www.gnu.org/licenses/>.
18
19	1	"""abydos.phonetic._dolby.
20
21		The phonetic._dolby module implements the Dolby Code algorithm.
22		"""
23
24	1	from __future__ import unicode_literals
25
26	1	from unicodedata import normalize as unicode_normalize
27
28	1	from six import text_type
29
30	1	from ._util import _delete_consecutive_repeats
31
32	1	__all__ = ['dolby']
33
34
35	1	def dolby(word, max_length=-1, keep_vowels=False, vowel_char='*'):
36		r"""Return the Dolby Code of a name.
37
38		This follows "A Spelling Equivalent Abbreviation Algorithm For Personal
39		Names" from :cite:`Dolby:1970` and :cite:`Cunningham:1969`.
40
41		:param word: the word to encode
42		:param max_length: maximum length of the returned Dolby code -- this also
43		activates the fixed-length code mode if it is greater than 0
44		:param keep_vowels: if True, retains all vowel markers
45		:param vowel_char: the vowel marker character (default to \*)
46		:returns: the Dolby Code
47		:rtype: str
48
49		>>> dolby('Hansen')
50		'H*NSN'
51		>>> dolby('Larsen')
52		'L*RSN'
53		>>> dolby('Aagaard')
54		'*GR'
55		>>> dolby('Braaten')
56		'BR*DN'
57		>>> dolby('Sandvik')
58		'S*NVK'
59		>>> dolby('Hansen', max_length=6)
60		'H*NS*N'
61		>>> dolby('Larsen', max_length=6)
62		'L*RS*N'
63		>>> dolby('Aagaard', max_length=6)
64		'*G*R  '
65		>>> dolby('Braaten', max_length=6)
66		'BR*D*N'
67		>>> dolby('Sandvik', max_length=6)
68		'S*NF*K'
69
70		>>> dolby('Smith')
71		'SM*D'
72		>>> dolby('Waters')
73		'W*DRS'
74		>>> dolby('James')
75		'J*MS'
76		>>> dolby('Schmidt')
77		'SM*D'
78		>>> dolby('Ashcroft')
79		'*SKRFD'
80		>>> dolby('Smith', max_length=6)
81		'SM*D  '
82		>>> dolby('Waters', max_length=6)
83		'W*D*RS'
84		>>> dolby('James', max_length=6)
85		'J*M*S '
86		>>> dolby('Schmidt', max_length=6)
87		'SM*D  '
88		>>> dolby('Ashcroft', max_length=6)
89		'*SKRFD'
90		"""
91	1	_vowels = {'A', 'E', 'I', 'O', 'U', 'Y'}
92
93		# uppercase, normalize, decompose, and filter non-A-Z out
94	1	word = unicode_normalize('NFKD', text_type(word.upper()))
95	1	word = word.replace('ß', 'SS')
96	1	word = ''.join(
97		c
98		for c in word
99		if c
100		in {
101		'A',
102		'B',
103		'C',
104		'D',
105		'E',
106		'F',
107		'G',
108		'H',
109		'I',
110		'J',
111		'K',
112		'L',
113		'M',
114		'N',
115		'O',
116		'P',
117		'Q',
118		'R',
119		'S',
120		'T',
121		'U',
122		'V',
123		'W',
124		'X',
125		'Y',
126		'Z',
127		}
128		)
129
130		# Rule 1 (FL2)
131	1	if word[:3] in {'MCG', 'MAG', 'MAC'}:
132	1	word = 'MK' + word[3:]
133	1	elif word[:2] == 'MC':
134	1	word = 'MK' + word[2:]
135
136		# Rule 2 (FL3)
137	1	pos = len(word) - 2
138	1	while pos > -1:
139	1	if word[pos : pos + 2] in {
140		'DT',
		0 ignored issues – show Coding Style introduced 2018-10-24 06:00 UTC by Report Bug Copy Issue Report Wrong hanging indentation before block (add 4 spaces). Loading history...
141		'LD',
		0 ignored issues – show Coding Style introduced 2018-10-24 06:00 UTC by Report Bug Copy Issue Report Wrong hanging indentation before block (add 4 spaces). Loading history...
142		'ND',
		0 ignored issues – show Coding Style introduced 2018-10-24 06:00 UTC by Report Bug Copy Issue Report Wrong hanging indentation before block (add 4 spaces). Loading history...
143		'NT',
		0 ignored issues – show Coding Style introduced 2018-10-24 06:00 UTC by Report Bug Copy Issue Report Wrong hanging indentation before block (add 4 spaces). Loading history...
144		'RC',
		0 ignored issues – show Coding Style introduced 2018-10-24 06:00 UTC by Report Bug Copy Issue Report Wrong hanging indentation before block (add 4 spaces). Loading history...
145		'RD',
		0 ignored issues – show Coding Style introduced 2018-10-24 06:00 UTC by Report Bug Copy Issue Report Wrong hanging indentation before block (add 4 spaces). Loading history...
146		'RT',
		0 ignored issues – show Coding Style introduced 2018-10-24 06:00 UTC by Report Bug Copy Issue Report Wrong hanging indentation before block (add 4 spaces). Loading history...
147		'SC',
		0 ignored issues – show Coding Style introduced 2018-10-24 06:00 UTC by Report Bug Copy Issue Report Wrong hanging indentation before block (add 4 spaces). Loading history...
148		'SK',
		0 ignored issues – show Coding Style introduced 2018-10-24 06:00 UTC by Report Bug Copy Issue Report Wrong hanging indentation before block (add 4 spaces). Loading history...
149		'ST',
		0 ignored issues – show Coding Style introduced 2018-10-24 06:00 UTC by Report Bug Copy Issue Report Wrong hanging indentation before block (add 4 spaces). Loading history...
150		}:
151	1	word = word[: pos + 1] + word[pos + 2 :]
152	1	pos += 1
153	1	pos -= 1
154
155		# Rule 3 (FL4)
156		# Although the rule indicates "after the first letter", the test cases make
157		# it clear that these apply to the first letter also.
158	1	word = word.replace('X', 'KS')
159	1	word = word.replace('CE', 'SE')
160	1	word = word.replace('CI', 'SI')
161	1	word = word.replace('CY', 'SI')
162
163		# not in the rule set, but they seem to have intended it
164	1	word = word.replace('TCH', 'CH')
165
166	1	pos = word.find('CH', 1)
167	1	while pos != -1:
168	1	if word[pos - 1 : pos] not in _vowels:
169	1	word = word[:pos] + 'S' + word[pos + 1 :]
170	1	pos = word.find('CH', pos + 1)
171
172	1	word = word.replace('C', 'K')
173	1	word = word.replace('Z', 'S')
174
175	1	word = word.replace('WR', 'R')
176	1	word = word.replace('DG', 'G')
177	1	word = word.replace('QU', 'K')
178	1	word = word.replace('T', 'D')
179	1	word = word.replace('PH', 'F')
180
181		# Rule 4 (FL5)
182		# Although the rule indicates "after the first letter", the test cases make
183		# it clear that these apply to the first letter also.
184	1	pos = word.find('K', 0)
185	1	while pos != -1:
186	1	if pos > 1 and word[pos - 1 : pos] not in _vowels | {'L', 'N', 'R'}:
187	1	word = word[: pos - 1] + word[pos:]
188	1	pos -= 1
189	1	pos = word.find('K', pos + 1)
190
191		# Rule FL6
192	1	if max_length > 0 and word[-1:] == 'E':
193	1	word = word[:-1]
194
195		# Rule 5 (FL7)
196	1	word = _delete_consecutive_repeats(word)
197
198		# Rule 6 (FL8)
199	1	if word[:2] == 'PF':
200	1	word = word[1:]
201	1	if word[-2:] == 'PF':
202	1	word = word[:-1]
203	1	elif word[-2:] == 'GH':
204	1	if word[-3:-2] in _vowels:
205	1	word = word[:-2] + 'F'
206		else:
207	1	word = word[:-2] + 'G'
208	1	word = word.replace('GH', '')
209
210		# Rule FL9
211	1	if max_length > 0:
212	1	word = word.replace('V', 'F')
213
214		# Rules 7-9 (FL10-FL12)
215	1	first = 1 + (1 if max_length > 0 else 0)
216	1	code = ''
217	1	for pos, char in enumerate(word):
218	1	if char in _vowels:
219	1	if first or keep_vowels:
220	1	code += vowel_char
221	1	first -= 1
222	1	elif pos > 0 and char in {'W', 'H'}:
223	1	continue
224		else:
225	1	code += char
226
227	1	if max_length > 0:
		0 ignored issues – show unused-code introduced 2018-10-20 00:45 UTC by Report Bug Copy Issue Report Too many nested blocks (6/5) Loading history...
228		# Rule FL13
229	1	if len(code) > max_length and code[-1:] == 'S':
230	1	code = code[:-1]
231	1	if keep_vowels:
232	1	code = code[:max_length]
233		else:
234		# Rule FL14
235	1	code = code[: max_length + 2]
236		# Rule FL15
237	1	while len(code) > max_length:
238	1	vowels = len(code) - max_length
239	1	excess = vowels - 1
240	1	word = code
241	1	code = ''
242	1	for char in word:
243	1	if char == vowel_char:
244	1	if vowels:
245	1	code += char
246	1	vowels -= 1
247		else:
248	1	code += char
249	1	code = code[: max_length + excess]
250
251		# Rule FL16
252	1	code += ' ' * (max_length - len(code))
253
254	1	return code
255
256
257		if __name__ == '__main__':
258		import doctest
259
260		doctest.testmod()
261

chrislit / abydos

Branch — master (78a222)

abydos.phonetic._dolby A

Complexity

Size/Duplication

Test Coverage

Importance

1 Function

Duplication Side-by-Side

Filter issues like