Completed
Branch master (78a222)
by Chris
14:36
created

abydos.phonetic._dolby   A

Complexity

Total Complexity 32

Size/Duplication

Total Lines 261
Duplicated Lines 0 %

Test Coverage

Coverage 100%

Importance

Changes 0
Metric Value
wmc 32
eloc 132
dl 0
loc 261
ccs 88
cts 88
cp 1
rs 9.84
c 0
b 0
f 0

1 Function

Rating   Name   Duplication   Size   Complexity  
F dolby() 0 220 32
1
# -*- coding: utf-8 -*-
2
3
# Copyright 2018 by Christopher C. Little.
4
# This file is part of Abydos.
5
#
6
# Abydos is free software: you can redistribute it and/or modify
7
# it under the terms of the GNU General Public License as published by
8
# the Free Software Foundation, either version 3 of the License, or
9
# (at your option) any later version.
10
#
11
# Abydos is distributed in the hope that it will be useful,
12
# but WITHOUT ANY WARRANTY; without even the implied warranty of
13
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14
# GNU General Public License for more details.
15
#
16
# You should have received a copy of the GNU General Public License
17
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.
18
19 1
"""abydos.phonetic._dolby.
20
21
The phonetic._dolby module implements the Dolby Code algorithm.
22
"""
23
24 1
from __future__ import unicode_literals
25
26 1
from unicodedata import normalize as unicode_normalize
27
28 1
from six import text_type
29
30 1
from ._util import _delete_consecutive_repeats
31
32 1
__all__ = ['dolby']
33
34
35 1
# Letters that survive normalization; every other character is discarded.
_DOLBY_ALPHABET = frozenset('ABCDEFGHIJKLMNOPQRSTUVWXYZ')

# Letters treated as vowels by the algorithm (note that Y is included).
_DOLBY_VOWELS = frozenset('AEIOUY')

# Letters that may directly precede a K without being deleted (Rule 4).
_DOLBY_K_PREDECESSORS = _DOLBY_VOWELS | frozenset('LNR')

# Two-letter clusters whose second letter is dropped (Rule 2).
_DOLBY_CLUSTERS = frozenset(
    ('DT', 'LD', 'ND', 'NT', 'RC', 'RD', 'RT', 'SC', 'SK', 'ST')
)

# Ordered substitutions of Rule 3. The order is significant: the first group
# runs before the CH handling and the second group after it.
_DOLBY_PRE_CH_SUBSTITUTIONS = (
    ('X', 'KS'),
    ('CE', 'SE'),
    ('CI', 'SI'),
    ('CY', 'SI'),
    # not in the rule set, but they seem to have intended it
    ('TCH', 'CH'),
)
_DOLBY_POST_CH_SUBSTITUTIONS = (
    ('C', 'K'),
    ('Z', 'S'),
    ('WR', 'R'),
    ('DG', 'G'),
    ('QU', 'K'),
    ('T', 'D'),
    ('PH', 'F'),
)


def _dolby_normalize(word):
    """Uppercase and NFKD-decompose word, keeping only the letters A-Z."""
    word = unicode_normalize('NFKD', text_type(word.upper()))
    # Expand sharp s explicitly in case uppercasing left it untouched.
    word = word.replace('ß', 'SS')
    return ''.join(char for char in word if char in _DOLBY_ALPHABET)


def _dolby_drop_cluster_finals(word):
    """Apply Rule 2 (FL3): drop the second letter of the listed clusters."""
    pos = len(word) - 2
    # Scan right to left. After a deletion the same position is examined
    # again, because the shortened word may form a new cluster there.
    while pos > -1:
        if word[pos : pos + 2] in _DOLBY_CLUSTERS:
            word = word[: pos + 1] + word[pos + 2 :]
        else:
            pos -= 1
    return word


def _dolby_respell(word):
    """Apply Rule 3 (FL4): the ordered letter-group substitutions."""
    # Although the rule indicates "after the first letter", the test cases
    # make it clear that these apply to the first letter also.
    for old, new in _DOLBY_PRE_CH_SUBSTITUTIONS:
        word = word.replace(old, new)

    # A non-initial CH that does not follow a vowel becomes SH; every other
    # C is turned into K by the substitutions below.
    pos = word.find('CH', 1)
    while pos != -1:
        if word[pos - 1 : pos] not in _DOLBY_VOWELS:
            word = word[:pos] + 'S' + word[pos + 1 :]
        pos = word.find('CH', pos + 1)

    for old, new in _DOLBY_POST_CH_SUBSTITUTIONS:
        word = word.replace(old, new)
    return word


def _dolby_drop_before_k(word):
    """Apply Rule 4 (FL5): delete the letter before K unless it may stay."""
    pos = word.find('K', 0)
    while pos != -1:
        # Only a K at index 2 or later is considered, so the first letter
        # of the word is never deleted here.
        if pos > 1 and word[pos - 1 : pos] not in _DOLBY_K_PREDECESSORS:
            word = word[: pos - 1] + word[pos:]
            # The K moved one place left; follow it so that the search
            # resumes just past this K.
            pos -= 1
        pos = word.find('K', pos + 1)
    return word


def _dolby_fix_pf_gh(word):
    """Apply Rule 6 (FL8): simplify initial/final PF and any GH."""
    if word[:2] == 'PF':
        word = word[1:]
    if word[-2:] == 'PF':
        word = word[:-1]
    elif word[-2:] == 'GH':
        # Final GH becomes F after a vowel and G otherwise. The slice is
        # empty for a two-letter word, which takes the G branch.
        if word[-3:-2] in _DOLBY_VOWELS:
            word = word[:-2] + 'F'
        else:
            word = word[:-2] + 'G'
    # Any GH that remains is removed.
    return word.replace('GH', '')


def _dolby_encode(word, vowel_budget, keep_vowels, vowel_char):
    """Apply Rules 7-9 (FL10-FL12): mark vowels and drop non-initial W/H."""
    pieces = []
    for pos, char in enumerate(word):
        if char in _DOLBY_VOWELS:
            # Without keep_vowels only the first vowel_budget vowels are
            # marked; later vowels are dropped.
            if vowel_budget or keep_vowels:
                pieces.append(vowel_char)
                vowel_budget -= 1
        elif pos > 0 and char in {'W', 'H'}:
            continue
        else:
            pieces.append(char)
    return ''.join(pieces)


def _dolby_keep_leading_markers(code, vowel_char, count):
    """Return code with all but its first count vowel markers removed."""
    kept = []
    for char in code:
        if char != vowel_char:
            kept.append(char)
        elif count:
            kept.append(char)
            count -= 1
    return ''.join(kept)


def _dolby_fit_length(code, max_length, keep_vowels, vowel_char):
    """Apply Rules FL13-FL16: cut or pad code to exactly max_length."""
    # Rule FL13
    if len(code) > max_length and code[-1:] == 'S':
        code = code[:-1]

    if keep_vowels:
        code = code[:max_length]
    else:
        # Rule FL14
        code = code[: max_length + 2]
        # Rule FL15
        # Each pass keeps only as many leading vowel markers as there are
        # surplus characters, then trims to one character less than before,
        # so the surplus shrinks on every pass and the loop terminates.
        while len(code) > max_length:
            surplus = len(code) - max_length
            code = _dolby_keep_leading_markers(code, vowel_char, surplus)
            code = code[: max_length + surplus - 1]

    # Rule FL16
    return code + ' ' * (max_length - len(code))


def dolby(word, max_length=-1, keep_vowels=False, vowel_char='*'):
    r"""Return the Dolby Code of a name.

    This follows "A Spelling Equivalent Abbreviation Algorithm For Personal
    Names" from :cite:`Dolby:1970` and :cite:`Cunningham:1969`.

    The word is uppercased, decomposed, and reduced to the letters A-Z before
    the rules are applied. When max_length is greater than 0 the fixed-length
    variant is used: a final E is dropped, V is coded as F, up to two vowels
    are marked instead of one, and the result is cut or space-padded to
    exactly max_length characters.

    :param word: the word to encode
    :param max_length: maximum length of the returned Dolby code -- this also
        activates the fixed-length code mode if it is greater than 0
    :param keep_vowels: if True, retains all vowel markers
    :param vowel_char: the vowel marker character (default to \*)
    :returns: the Dolby Code
    :rtype: str

    >>> dolby('Hansen')
    'H*NSN'
    >>> dolby('Larsen')
    'L*RSN'
    >>> dolby('Aagaard')
    '*GR'
    >>> dolby('Braaten')
    'BR*DN'
    >>> dolby('Sandvik')
    'S*NVK'
    >>> dolby('Hansen', max_length=6)
    'H*NS*N'
    >>> dolby('Larsen', max_length=6)
    'L*RS*N'
    >>> dolby('Aagaard', max_length=6)
    '*G*R  '
    >>> dolby('Braaten', max_length=6)
    'BR*D*N'
    >>> dolby('Sandvik', max_length=6)
    'S*NF*K'

    >>> dolby('Smith')
    'SM*D'
    >>> dolby('Waters')
    'W*DRS'
    >>> dolby('James')
    'J*MS'
    >>> dolby('Schmidt')
    'SM*D'
    >>> dolby('Ashcroft')
    '*SKRFD'
    >>> dolby('Smith', max_length=6)
    'SM*D  '
    >>> dolby('Waters', max_length=6)
    'W*D*RS'
    >>> dolby('James', max_length=6)
    'J*M*S '
    >>> dolby('Schmidt', max_length=6)
    'SM*D  '
    >>> dolby('Ashcroft', max_length=6)
    '*SKRFD'
    """
    # uppercase, normalize, decompose, and filter non-A-Z out
    word = _dolby_normalize(word)

    # Rule 1 (FL2)
    if word[:3] in {'MCG', 'MAG', 'MAC'}:
        word = 'MK' + word[3:]
    elif word[:2] == 'MC':
        word = 'MK' + word[2:]

    # Rules 2-4 (FL3-FL5)
    word = _dolby_drop_cluster_finals(word)
    word = _dolby_respell(word)
    word = _dolby_drop_before_k(word)

    fixed_length = max_length > 0

    # Rule FL6
    if fixed_length and word[-1:] == 'E':
        word = word[:-1]

    # Rule 5 (FL7)
    word = _delete_consecutive_repeats(word)

    # Rule 6 (FL8)
    word = _dolby_fix_pf_gh(word)

    # Rule FL9
    if fixed_length:
        word = word.replace('V', 'F')

    # Rules 7-9 (FL10-FL12): the fixed-length mode marks up to two vowels,
    # the variable-length mode only one.
    code = _dolby_encode(
        word, 2 if fixed_length else 1, keep_vowels, vowel_char
    )

    # Rules FL13-FL16
    if fixed_length:
        code = _dolby_fit_length(code, max_length, keep_vowels, vowel_char)

    return code
255
256
257
if __name__ == '__main__':
    # When executed as a script, run the doctests embedded in this module.
    import doctest

    doctest.testmod()
261