abydos.phonetic._dolby.Dolby.encode_alpha()   A
last analyzed

Complexity

Conditions 1

Size

Total Lines 32
Code Lines 2

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 4
CRAP Score 1

Importance

Changes 0
Metric Value
eloc 2
dl 0
loc 32
ccs 4
cts 4
cp 1
rs 10
c 0
b 0
f 0
cc 1
nop 2
crap 1
# Copyright 2018-2020 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.

"""abydos.phonetic._dolby.

Dolby Code
"""

from unicodedata import normalize as unicode_normalize
23
24 1
from ._phonetic import _Phonetic
25
26
__all__ = ['Dolby']
27
28
29
class Dolby(_Phonetic):
    """Dolby Code.

    This follows "A Spelling Equivalent Abbreviation Algorithm For Personal
    Names" from :cite:`Dolby:1970` and :cite:`Cunningham:1969`.

    .. versionadded:: 0.3.6
    """

    def __init__(
        self,
        max_length: int = -1,
        keep_vowels: bool = False,
        vowel_char: str = '*',
    ) -> None:
        r"""Initialize Dolby instance.

        Parameters
        ----------
        max_length : int
            Maximum length of the returned Dolby code -- this also activates
            the fixed-length code mode if it is greater than 0
        keep_vowels : bool
            If True, retains all vowel markers
        vowel_char : str
            The vowel marker character (default to \*)


        .. versionadded:: 0.4.0

        """
        self._max_length = max_length
        self._keep_vowels = keep_vowels
        self._vowel_char = vowel_char

    def encode_alpha(self, word: str) -> str:
        """Return the alphabetic Dolby Code of a name.

        This is the result of :meth:`encode` with every occurrence of the
        configured vowel marker replaced by ``'A'``.

        Parameters
        ----------
        word : str
            The word to transform

        Returns
        -------
        str
            The alphabetic Dolby Code

        Examples
        --------
        >>> pe = Dolby()
        >>> pe.encode_alpha('Hansen')
        'HANSN'
        >>> pe.encode_alpha('Larsen')
        'LARSN'
        >>> pe.encode_alpha('Aagaard')
        'AGR'
        >>> pe.encode_alpha('Braaten')
        'BRADN'
        >>> pe.encode_alpha('Sandvik')
        'SANVK'


        .. versionadded:: 0.4.0

        """
        code = self.encode(word)
        # With an empty marker there is nothing to substitute, and
        # str.replace('', 'A') would insert 'A' around every character.
        if not self._vowel_char:
            return code
        return code.replace(self._vowel_char, 'A')

    def encode(self, word: str) -> str:
        """Return the Dolby Code of a name.

        Parameters
        ----------
        word : str
            The word to transform

        Returns
        -------
        str
            The Dolby Code. When ``max_length`` is greater than 0 the code is
            shortened or right-padded with spaces to ``max_length``
            characters.

        Examples
        --------
        >>> pe = Dolby()
        >>> pe.encode('Hansen')
        'H*NSN'
        >>> pe.encode('Larsen')
        'L*RSN'
        >>> pe.encode('Aagaard')
        '*GR'
        >>> pe.encode('Braaten')
        'BR*DN'
        >>> pe.encode('Sandvik')
        'S*NVK'

        >>> pe_6 = Dolby(max_length=6)
        >>> pe_6.encode('Hansen')
        'H*NS*N'
        >>> pe_6.encode('Larsen')
        'L*RS*N'
        >>> pe_6.encode('Aagaard')
        '*G*R  '
        >>> pe_6.encode('Braaten')
        'BR*D*N'
        >>> pe_6.encode('Sandvik')
        'S*NF*K'

        >>> pe.encode('Smith')
        'SM*D'
        >>> pe.encode('Waters')
        'W*DRS'
        >>> pe.encode('James')
        'J*MS'
        >>> pe.encode('Schmidt')
        'SM*D'
        >>> pe.encode('Ashcroft')
        '*SKRFD'

        >>> pe_6.encode('Smith')
        'SM*D  '
        >>> pe_6.encode('Waters')
        'W*D*RS'
        >>> pe_6.encode('James')
        'J*M*S '
        >>> pe_6.encode('Schmidt')
        'SM*D  '
        >>> pe_6.encode('Ashcroft')
        '*SKRFD'


        .. versionadded:: 0.3.0
        .. versionchanged:: 0.3.6
            Encapsulated in class

        """
        # uppercase, normalize, decompose, and filter non-A-Z out
        word = unicode_normalize('NFKD', word.upper())
        word = ''.join(c for c in word if c in self._uc_set)

        # Rule 1 (FL2)
        if word[:3] in {'MCG', 'MAG', 'MAC'}:
            word = 'MK' + word[3:]
        elif word[:2] == 'MC':
            word = 'MK' + word[2:]

        # Rule 2 (FL3)
        # Scan right-to-left; when a listed pair is found, drop its second
        # letter and re-test the same position against the shortened word.
        pos = len(word) - 2
        while pos > -1:
            if word[pos : pos + 2] in {
                'DT',
                'LD',
                'ND',
                'NT',
                'RC',
                'RD',
                'RT',
                'SC',
                'SK',
                'ST',
            }:
                word = word[: pos + 1] + word[pos + 2 :]
                pos += 1
            pos -= 1

        # Rule 3 (FL4)
        # Although the rule indicates "after the first letter", the test cases
        # make it clear that these apply to the first letter also.
        word = word.replace('X', 'KS')
        word = word.replace('CE', 'SE')
        word = word.replace('CI', 'SI')
        word = word.replace('CY', 'SI')

        # not in the rule set, but they seem to have intended it
        word = word.replace('TCH', 'CH')

        # A non-initial 'CH' not preceded by a vowel/Y has its 'C' turned
        # into 'S'.
        pos = word.find('CH', 1)
        while pos != -1:
            if word[pos - 1 : pos] not in self._uc_vy_set:
                word = word[:pos] + 'S' + word[pos + 1 :]
            pos = word.find('CH', pos + 1)

        word = word.replace('C', 'K')
        word = word.replace('Z', 'S')

        word = word.replace('WR', 'R')
        word = word.replace('DG', 'G')
        word = word.replace('QU', 'K')
        word = word.replace('T', 'D')
        word = word.replace('PH', 'F')

        # Rule 4 (FL5)
        # Although the rule indicates "after the first letter", the test cases
        # make it clear that these apply to the first letter also.
        # NOTE(review): the ``pos > 1`` guard below means the letter at index
        # 0 is never deleted here -- confirm the comment above against the
        # intended rule.
        keep_before_k = self._uc_vy_set | {'L', 'N', 'R'}  # loop-invariant
        pos = word.find('K', 0)
        while pos != -1:
            if pos > 1 and word[pos - 1 : pos] not in keep_before_k:
                # Delete the letter preceding this K; K shifts left by one.
                word = word[: pos - 1] + word[pos:]
                pos -= 1
            pos = word.find('K', pos + 1)

        # Rule FL6
        if self._max_length > 0 and word[-1:] == 'E':
            word = word[:-1]

        # Rule 5 (FL7)
        word = self._delete_consecutive_repeats(word)

        # Rule 6 (FL8)
        if word[:2] == 'PF':
            word = word[1:]
        if word[-2:] == 'PF':
            word = word[:-1]
        elif word[-2:] == 'GH':
            if word[-3:-2] in self._uc_vy_set:
                word = word[:-2] + 'F'
            else:
                word = word[:-2] + 'G'
        word = word.replace('GH', '')

        # Rule FL9
        if self._max_length > 0:
            word = word.replace('V', 'F')

        # Rules 7-9 (FL10-FL12)
        # ``first`` counts how many vowel markers may still be emitted when
        # keep_vowels is off: one normally, two in fixed-length mode.
        first = 1 + (1 if self._max_length > 0 else 0)
        pieces = []
        for pos, char in enumerate(word):
            if char in self._uc_vy_set:
                if first or self._keep_vowels:
                    pieces.append(self._vowel_char)
                    first -= 1
            elif pos > 0 and char in {'W', 'H'}:
                # non-initial W and H are dropped
                continue
            else:
                pieces.append(char)
        code = ''.join(pieces)

        if self._max_length > 0:
            # Rule FL13
            if len(code) > self._max_length and code[-1:] == 'S':
                code = code[:-1]
            if self._keep_vowels:
                code = code[: self._max_length]
            else:
                # Rule FL14
                code = code[: self._max_length + 2]
                # Rule FL15
                # Each pass keeps only the first ``vowels`` markers and trims
                # to one character less than before, so the loop terminates.
                while len(code) > self._max_length:
                    vowels = len(code) - self._max_length
                    excess = vowels - 1
                    kept = []
                    for char in code:
                        if char == self._vowel_char:
                            if vowels:
                                kept.append(char)
                                vowels -= 1
                        else:
                            kept.append(char)
                    code = ''.join(kept)[: self._max_length + excess]

            # Rule FL16
            code += ' ' * (self._max_length - len(code))

        return code
297 1
298 1
299
# Run the doctests embedded in this module's docstrings when the file is
# executed directly as a script.
if __name__ == '__main__':
    import doctest

    doctest.testmod()
303