abydos.phonetic.dolby - Code Metrics - Inspection of "Merge pull request #120 from chrislit/modularize" - chrislit/abydos - Measure and Improve Code Quality continuously with Scrutinizer

Test Failed

Push — master ( 64abe2...a464fa )

by Chris

created 2018-10-19 22:32 UTC

abydos.phonetic.dolby A

↳ Parent: Project

Complexity

Total Complexity

Size/Duplication

Total Lines	221
Duplicated Lines	0 %

Importance

Changes

Metric	Value
eloc	96
dl	0
loc	221
rs	9.84
c	0
b	0
f	0
wmc	32

1 Function

Rating	Name	Duplication	Size	Complexity
F	dolby()	0	181	32

# -*- coding: utf-8 -*-

# Copyright 2018 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.

"""abydos.phonetic.dolby.

The phonetic.dolby module implements the Dolby Code algorithm.
"""

from __future__ import unicode_literals

from unicodedata import normalize as unicode_normalize

from six import text_type

from . import _delete_consecutive_repeats

__all__ = ['dolby']


# Letters treated as vowels by the Dolby rules (Y is included).
_DOLBY_VOWELS = frozenset('AEIOUY')
# The only characters that survive input normalization.
_DOLBY_LETTERS = frozenset('ABCDEFGHIJKLMNOPQRSTUVWXYZ')
# Two-letter clusters whose second letter is dropped by Rule 2 (FL3).
_DOLBY_CLUSTERS = frozenset(('DT', 'LD', 'ND', 'NT', 'RC', 'RD', 'RT', 'SC',
                             'SK', 'ST'))
# Letters that may stay immediately in front of a K under Rule 4 (FL5).
_DOLBY_KEPT_BEFORE_K = _DOLBY_VOWELS | frozenset('LNR')


def _dolby_normalize(word):
    """Uppercase and NFKD-decompose a word, keeping only the letters A-Z."""
    word = unicode_normalize('NFKD', text_type(word.upper()))
    # On interpreters whose upper() leaves eszett untouched it is expanded
    # here; otherwise this replace is a no-op.
    word = word.replace('ß', 'SS')
    return ''.join(char for char in word if char in _DOLBY_LETTERS)


def _dolby_respell(word):
    """Apply the respelling rules 1-4 (FL2-FL5) to a normalized word."""
    # Rule 1 (FL2): MCG/MAG/MAC and MC prefixes all become MK
    if word[:3] in {'MCG', 'MAG', 'MAC'}:
        word = 'MK' + word[3:]
    elif word[:2] == 'MC':
        word = 'MK' + word[2:]

    # Rule 2 (FL3): scanning right to left, drop the second letter of each
    # listed cluster. After a deletion the same position is examined again,
    # because the letter that slid into place may form a new cluster.
    pos = len(word) - 2
    while pos > -1:
        if word[pos:pos + 2] in _DOLBY_CLUSTERS:
            word = word[:pos + 1] + word[pos + 2:]
        else:
            pos -= 1

    # Rule 3 (FL4)
    # Although the rule indicates "after the first letter", the test cases make
    # it clear that these apply to the first letter also.
    # The order of the substitutions below is significant.
    for old, new in (('X', 'KS'), ('CE', 'SE'), ('CI', 'SI'), ('CY', 'SI'),
                     # not in the rule set, but they seem to have intended it
                     ('TCH', 'CH')):
        word = word.replace(old, new)

    # A non-initial CH that does not follow a vowel becomes SH.
    pos = word.find('CH', 1)
    while pos != -1:
        if word[pos - 1:pos] not in _DOLBY_VOWELS:
            word = word[:pos] + 'S' + word[pos + 1:]
        pos = word.find('CH', pos + 1)

    for old, new in (('C', 'K'), ('Z', 'S'), ('WR', 'R'), ('DG', 'G'),
                     ('QU', 'K'), ('T', 'D'), ('PH', 'F')):
        word = word.replace(old, new)

    # Rule 4 (FL5): drop the letter in front of a K unless it is a vowel, L,
    # N, or R. Only a K at index 2 or later is considered, so the first letter
    # of the word is never removed by this rule.
    pos = word.find('K', 0)
    while pos != -1:
        if pos > 1 and word[pos - 1:pos] not in _DOLBY_KEPT_BEFORE_K:
            word = word[:pos - 1] + word[pos:]
            pos -= 1  # the K itself moved one place to the left
        pos = word.find('K', pos + 1)

    return word


def _dolby_reduce(word, fixed_length):
    """Apply rules FL6-FL9: trailing E, repeated letters, PF/GH, and V.

    The FL6 and FL9 steps are only applied when ``fixed_length`` is true.
    """
    # Rule FL6
    if fixed_length and word[-1:] == 'E':
        word = word[:-1]

    # Rule 5 (FL7)
    word = _delete_consecutive_repeats(word)

    # Rule 6 (FL8)
    if word[:2] == 'PF':
        word = word[1:]
    if word[-2:] == 'PF':
        word = word[:-1]
    elif word[-2:] == 'GH':
        # a final GH becomes F after a vowel and G otherwise
        word = word[:-2] + ('F' if word[-3:-2] in _DOLBY_VOWELS else 'G')
    word = word.replace('GH', '')

    # Rule FL9
    if fixed_length:
        word = word.replace('V', 'F')

    return word


def _dolby_mark_vowels(word, fixed_length, keep_vowels, vowel_char):
    """Apply rules 7-9 (FL10-FL12): mark vowels and drop non-initial W and H.

    Unless ``keep_vowels`` is true, only the first vowel is marked (the first
    two when ``fixed_length`` is true) and the remaining vowels are dropped.
    """
    markers_left = 2 if fixed_length else 1
    pieces = []
    for pos, char in enumerate(word):
        if char in _DOLBY_VOWELS:
            if keep_vowels or markers_left > 0:
                pieces.append(vowel_char)
                markers_left -= 1
        elif pos == 0 or char not in {'W', 'H'}:
            pieces.append(char)
    return ''.join(pieces)


def _dolby_fit_length(code, max_length, keep_vowels, vowel_char):
    """Apply rules FL13-FL16: cut or pad a code to exactly ``max_length``."""
    # Rule FL13
    if len(code) > max_length and code[-1:] == 'S':
        code = code[:-1]

    if keep_vowels:
        code = code[:max_length]
    else:
        # Rule FL14
        code = code[:max_length + 2]
        # Rule FL15: each pass keeps at most as many vowel markers (counted
        # from the left) as there are surplus characters, then cuts one
        # character off the end, so the loop always terminates.
        while len(code) > max_length:
            markers_left = len(code) - max_length
            limit = len(code) - 1
            kept = []
            for char in code:
                if char != vowel_char:
                    kept.append(char)
                elif markers_left:
                    kept.append(char)
                    markers_left -= 1
            code = ''.join(kept)[:limit]

    # Rule FL16
    return code + ' ' * (max_length - len(code))


def dolby(word, max_length=-1, keep_vowels=False, vowel_char='*'):
    r"""Return the Dolby Code of a name.

    This follows "A Spelling Equivalent Abbreviation Algorithm For Personal
    Names" from :cite:`Dolby:1970` and :cite:`Cunningham:1969`.

    The word is uppercased, decomposed, and stripped of everything but A-Z,
    then respelled (rules 1-4), reduced (rules 5-6), and finally encoded with
    vowel markers (rules 7-9). When ``max_length`` is greater than 0 the
    additional fixed-length rules (FL6, FL9, FL13-FL16) are applied and the
    result is truncated or space-padded to exactly ``max_length`` characters.

    :param word: the word to encode
    :param max_length: maximum length of the returned Dolby code -- this also
        activates the fixed-length code mode if it is greater than 0
    :param keep_vowels: if True, retains all vowel markers
    :param vowel_char: the vowel marker character (default to \*)
    :returns: the Dolby Code
    :rtype: str

    >>> dolby('Hansen')
    'H*NSN'
    >>> dolby('Larsen')
    'L*RSN'
    >>> dolby('Aagaard')
    '*GR'
    >>> dolby('Braaten')
    'BR*DN'
    >>> dolby('Sandvik')
    'S*NVK'
    >>> dolby('Hansen', max_length=6)
    'H*NS*N'
    >>> dolby('Larsen', max_length=6)
    'L*RS*N'
    >>> dolby('Aagaard', max_length=6)
    '*G*R  '
    >>> dolby('Braaten', max_length=6)
    'BR*D*N'
    >>> dolby('Sandvik', max_length=6)
    'S*NF*K'

    >>> dolby('Smith')
    'SM*D'
    >>> dolby('Waters')
    'W*DRS'
    >>> dolby('James')
    'J*MS'
    >>> dolby('Schmidt')
    'SM*D'
    >>> dolby('Ashcroft')
    '*SKRFD'
    >>> dolby('Smith', max_length=6)
    'SM*D  '
    >>> dolby('Waters', max_length=6)
    'W*D*RS'
    >>> dolby('James', max_length=6)
    'J*M*S '
    >>> dolby('Schmidt', max_length=6)
    'SM*D  '
    >>> dolby('Ashcroft', max_length=6)
    '*SKRFD'
    """
    word = _dolby_respell(_dolby_normalize(word))

    # any positive max_length switches on the fixed-length rule variants
    fixed_length = max_length > 0

    word = _dolby_reduce(word, fixed_length)
    code = _dolby_mark_vowels(word, fixed_length, keep_vowels, vowel_char)

    if fixed_length:
        code = _dolby_fit_length(code, max_length, keep_vowels, vowel_char)

    return code


# When this module is executed directly as a script, run the doctests
# embedded in its docstrings.
if __name__ == '__main__':
    import doctest
    doctest.testmod()


1			# -*- coding: utf-8 -*-
2
3			# Copyright 2018 by Christopher C. Little.
4			# This file is part of Abydos.
5			#
6			# Abydos is free software: you can redistribute it and/or modify
7			# it under the terms of the GNU General Public License as published by
8			# the Free Software Foundation, either version 3 of the License, or
9			# (at your option) any later version.
10			#
11			# Abydos is distributed in the hope that it will be useful,
12			# but WITHOUT ANY WARRANTY; without even the implied warranty of
13			# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14			# GNU General Public License for more details.
15			#
16			# You should have received a copy of the GNU General Public License
17			# along with Abydos. If not, see <http://www.gnu.org/licenses/>.
18
19			"""abydos.phonetic.dolby.
20
21			The phonetic.dolby module implements the Dolby Code algorithm.
22			"""
23
24			from __future__ import unicode_literals
25
26			from unicodedata import normalize as unicode_normalize
27
28			from six import text_type
29
30			from . import _delete_consecutive_repeats
31
32			__all__ = ['dolby']
33
34
35			def dolby(word, max_length=-1, keep_vowels=False, vowel_char='*'):
36			r"""Return the Dolby Code of a name.
37
38			This follows "A Spelling Equivalent Abbreviation Algorithm For Personal
39			Names" from :cite:`Dolby:1970` and :cite:`Cunningham:1969`.
40
41			:param word: the word to encode
42			:param max_length: maximum length of the returned Dolby code -- this also
43			activates the fixed-length code mode if it is greater than 0
44			:param keep_vowels: if True, retains all vowel markers
45			:param vowel_char: the vowel marker character (default to \*)
46			:returns: the Dolby Code
47			:rtype: str
48
49			>>> dolby('Hansen')
50			'H*NSN'
51			>>> dolby('Larsen')
52			'L*RSN'
53			>>> dolby('Aagaard')
54			'*GR'
55			>>> dolby('Braaten')
56			'BR*DN'
57			>>> dolby('Sandvik')
58			'S*NVK'
59			>>> dolby('Hansen', max_length=6)
60			'H*NS*N'
61			>>> dolby('Larsen', max_length=6)
62			'L*RS*N'
63			>>> dolby('Aagaard', max_length=6)
64			'*G*R  '
65			>>> dolby('Braaten', max_length=6)
66			'BR*D*N'
67			>>> dolby('Sandvik', max_length=6)
68			'S*NF*K'
69
70			>>> dolby('Smith')
71			'SM*D'
72			>>> dolby('Waters')
73			'W*DRS'
74			>>> dolby('James')
75			'J*MS'
76			>>> dolby('Schmidt')
77			'SM*D'
78			>>> dolby('Ashcroft')
79			'*SKRFD'
80			>>> dolby('Smith', max_length=6)
81			'SM*D  '
82			>>> dolby('Waters', max_length=6)
83			'W*D*RS'
84			>>> dolby('James', max_length=6)
85			'J*M*S '
86			>>> dolby('Schmidt', max_length=6)
87			'SM*D  '
88			>>> dolby('Ashcroft', max_length=6)
89			'*SKRFD'
90			"""
91			_vowels = {'A', 'E', 'I', 'O', 'U', 'Y'}
92
93			# uppercase, normalize, decompose, and filter non-A-Z out
94			word = unicode_normalize('NFKD', text_type(word.upper()))
95			word = word.replace('ß', 'SS')
96			word = ''.join(c for c in word if c in
97			{'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L',
98			'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X',
99			'Y', 'Z'})
100
101			# Rule 1 (FL2)
102			if word[:3] in {'MCG', 'MAG', 'MAC'}:
103			word = 'MK'+word[3:]
104			elif word[:2] == 'MC':
105			word = 'MK'+word[2:]
106
107			# Rule 2 (FL3)
108			pos = len(word)-2
109			while pos > -1:
110			if word[pos:pos+2] in {'DT', 'LD', 'ND', 'NT', 'RC', 'RD', 'RT', 'SC',
111			'SK', 'ST'}:
112			word = word[:pos+1]+word[pos+2:]
113			pos += 1
114			pos -= 1
115
116			# Rule 3 (FL4)
117			# Although the rule indicates "after the first letter", the test cases make
118			# it clear that these apply to the first letter also.
119			word = word.replace('X', 'KS')
120			word = word.replace('CE', 'SE')
121			word = word.replace('CI', 'SI')
122			word = word.replace('CY', 'SI')
123
124			# not in the rule set, but they seem to have intended it
125			word = word.replace('TCH', 'CH')
126
127			pos = word.find('CH', 1)
128			while pos != -1:
129			if word[pos-1:pos] not in _vowels:
130			word = word[:pos]+'S'+word[pos+1:]
131			pos = word.find('CH', pos+1)
132
133			word = word.replace('C', 'K')
134			word = word.replace('Z', 'S')
135
136			word = word.replace('WR', 'R')
137			word = word.replace('DG', 'G')
138			word = word.replace('QU', 'K')
139			word = word.replace('T', 'D')
140			word = word.replace('PH', 'F')
141
142			# Rule 4 (FL5)
143			# Although the rule indicates "after the first letter", the test cases make
144			# it clear that these apply to the first letter also.
145			pos = word.find('K', 0)
146			while pos != -1:
147			if pos > 1 and word[pos-1:pos] not in _vowels | {'L', 'N', 'R'}:
148			word = word[:pos-1]+word[pos:]
149			pos -= 1
150			pos = word.find('K', pos+1)
151
152			# Rule FL6
153			if max_length > 0 and word[-1:] == 'E':
154			word = word[:-1]
155
156			# Rule 5 (FL7)
157			word = _delete_consecutive_repeats(word)
158
159			# Rule 6 (FL8)
160			if word[:2] == 'PF':
161			word = word[1:]
162			if word[-2:] == 'PF':
163			word = word[:-1]
164			elif word[-2:] == 'GH':
165			if word[-3:-2] in _vowels:
166			word = word[:-2]+'F'
167			else:
168			word = word[:-2]+'G'
169			word = word.replace('GH', '')
170
171			# Rule FL9
172			if max_length > 0:
173			word = word.replace('V', 'F')
174
175			# Rules 7-9 (FL10-FL12)
176			first = 1 + (1 if max_length > 0 else 0)
177			code = ''
178			for pos, char in enumerate(word):
179			if char in _vowels:
180			if first or keep_vowels:
181			code += vowel_char
182			first -= 1
183			elif pos > 0 and char in {'W', 'H'}:
184			continue
185			else:
186			code += char
187
188			if max_length > 0:
189			# Rule FL13
190			if len(code) > max_length and code[-1:] == 'S':
191			code = code[:-1]
192			if keep_vowels:
193			code = code[:max_length]
194			else:
195			# Rule FL14
196			code = code[:max_length + 2]
197			# Rule FL15
198			while len(code) > max_length:
199			vowels = len(code) - max_length
200			excess = vowels - 1
201			word = code
202			code = ''
203			for char in word:
204			if char == vowel_char:
205			if vowels:
206			code += char
207			vowels -= 1
208			else:
209			code += char
210			code = code[:max_length + excess]
211
212			# Rule FL16
213			code += ' ' * (max_length - len(code))
214
215			return code
216
217
218			if __name__ == '__main__':
219			import doctest
220			doctest.testmod()
221

chrislit / abydos

Push — master ( 64abe2...a464fa )

abydos.phonetic.dolby A

Complexity

Size/Duplication

Importance

1 Function

Duplication Side-by-Side

Filter issues like