Completed
Branch master (78a222)
by Chris
14:36
created

abydos.phonetic._sv.sfinxbis()   F

Complexity

Conditions 32

Size

Total Lines 259
Code Lines 177

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 87
CRAP Score 32

Importance

Changes 0
Metric Value
eloc 177
dl 0
loc 259
ccs 87
cts 87
cp 1
rs 0
c 0
b 0
f 0
cc 32
nop 2
crap 32

How to fix   Long Method    Complexity   

Long Method

Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.

For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.

Commonly applied refactorings include Extract Method.

Complexity

Complex classes like abydos.phonetic._sv.sfinxbis() often do a lot of different things. To break such a class down, we need to identify a cohesive component within that class. A common approach to find such a component is to look for fields/methods that share the same prefixes, or suffixes.

Once you have determined the fields that belong together, you can apply the Extract Class refactoring. If the component makes sense as a sub-class, Extract Subclass is also a candidate, and is often faster.

1
# -*- coding: utf-8 -*-
2
3
# Copyright 2014-2018 by Christopher C. Little.
4
# This file is part of Abydos.
5
#
6
# Abydos is free software: you can redistribute it and/or modify
7
# it under the terms of the GNU General Public License as published by
8
# the Free Software Foundation, either version 3 of the License, or
9
# (at your option) any later version.
10
#
11
# Abydos is distributed in the hope that it will be useful,
12
# but WITHOUT ANY WARRANTY; without even the implied warranty of
13
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14
# GNU General Public License for more details.
15
#
16
# You should have received a copy of the GNU General Public License
17
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.
18
19 1
"""abydos.phonetic._sv.
20
21
The phonetic._sv module implements phonetic algorithms for Scandinavian names
22
& languages (currently Swedish & Norwegian), including:
23
24
    - SfinxBis
25
    - Norphone
26
"""
27
28 1
from __future__ import unicode_literals
29
30 1
from unicodedata import normalize as unicode_normalize
31
32 1
from six import text_type
33
34 1
from ._util import _delete_consecutive_repeats
35
36 1
__all__ = ['norphone', 'sfinxbis']
37
38
39 1
def sfinxbis(word, max_length=-1):
    """Return the SfinxBis code for a word.

    SfinxBis is a Soundex-like algorithm defined in :cite:`Axelsson:2009`.

    This implementation follows the reference implementation:
    :cite:`Sjoo:2009`.

    SfinxBis is intended chiefly for Swedish names.

    The input is upper-cased, hyphens are treated as spaces, the
    prefixes/particles listed in ``adelstitler`` are removed, and every
    remaining whitespace-separated token is coded on its own.

    :param str word: the word to transform
    :param int max_length: the length of the code returned (defaults to
        unlimited); when greater than 0, each token's code is truncated to
        this many characters
    :returns: the SfinxBis value, one code per remaining token, or ``('',)``
        if no token remains
    :rtype: tuple

    >>> sfinxbis('Christopher')
    ('K68376',)
    >>> sfinxbis('Niall')
    ('N4',)
    >>> sfinxbis('Smith')
    ('S53',)
    >>> sfinxbis('Schmidt')
    ('S53',)

    >>> sfinxbis('Johansson')
    ('J585',)
    >>> sfinxbis('Sjöberg')
    ('#162',)
    """
    # Prefixes/particles that are stripped from the name before coding.
    adelstitler = (
        ' DE LA ',
        ' DE LAS ',
        ' DE LOS ',
        ' VAN DE ',
        ' VAN DEN ',
        ' VAN DER ',
        ' VON DEM ',
        ' VON DER ',
        ' AF ',
        ' AV ',
        ' DA ',
        ' DE ',
        ' DEL ',
        ' DEN ',
        ' DES ',
        ' DI ',
        ' DO ',
        ' DON ',
        ' DOS ',
        ' DU ',
        ' E ',
        ' IN ',
        ' LA ',
        ' LE ',
        ' MAC ',
        ' MC ',
        ' VAN ',
        ' VON ',
        ' Y ',
        ' S:T ',
    )

    # These are tuples, not sets, on purpose: the replacement loops in
    # _foersvensker are order-sensitive ('EI' -> 'EJ' and 'IY' -> 'IJ'
    # overlap), and set iteration order changes between interpreter runs
    # when string hashing is randomised. A fixed order keeps the code
    # returned for a given name reproducible.
    _harde_vokaler = ('A', 'O', 'U', 'Å')
    _mjuka_vokaler = ('E', 'I', 'Y', 'Ä', 'Ö')
    _vokaler = _harde_vokaler + _mjuka_vokaler
    _konsonanter = (
        'B', 'C', 'D', 'F', 'G', 'H', 'J', 'K', 'L', 'M',
        'N', 'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'X', 'Z',
    )
    _alfabet = frozenset('ABCDEFGHIJKLMNOPQRSTUVWXYZÄÅÖ')

    # Letter -> digit table for everything after the first sound; all vowels
    # map to '9', which is removed again in step 11.
    _sfinxbis_translation = {
        ord(bokstav): siffra
        for bokstav, siffra in zip(
            'BCDFGHJKLMNPQRSTVZAOUÅEIYÄÖ', '123729224551268378999999999'
        )
    }

    # Single-character substitutions applied at the end of _foersvensker.
    _sfinxbis_substitutions = {
        ord(fraan): till
        for fraan, till in zip(
            'WZÀÁÂÃÆÇÈÉÊËÌÍÎÏÑÒÓÔÕØÙÚÛÜÝ', 'VSAAAAÄCEEEEIIIINOOOOÖUUUYY'
        )
    }

    def _foersvensker(lokal_ordet):
        """Return the Swedish-ized form of the word."""
        for fraan, till in (
            ('STIERN', 'STJÄRN'),
            ('HIE', 'HJ'),
            ('SIÖ', 'SJÖ'),
            ('SCH', 'SH'),
            ('QU', 'KV'),
            ('IO', 'JO'),
            ('PH', 'F'),
        ):
            lokal_ordet = lokal_ordet.replace(fraan, till)

        # Ü, Y or I directly after a vowel becomes J. Hard vowels are handled
        # before soft ones, each group in its declared order.
        for vokal in _vokaler:
            for efter in ('Ü', 'Y', 'I'):
                lokal_ordet = lokal_ordet.replace(vokal + efter, vokal + 'J')

        # An H directly before a consonant is dropped.
        if 'H' in lokal_ordet:
            for konsonant in _konsonanter:
                lokal_ordet = lokal_ordet.replace('H' + konsonant, konsonant)

        lokal_ordet = lokal_ordet.translate(_sfinxbis_substitutions)

        lokal_ordet = lokal_ordet.replace('Ð', 'ETH')
        lokal_ordet = lokal_ordet.replace('Þ', 'TH')
        lokal_ordet = lokal_ordet.replace('ß', 'SS')

        return lokal_ordet

    def _koda_foersta_ljudet(lokal_ordet):
        """Return the word with the first sound coded."""
        # Slices (not indexing) so that short and empty words fall through.
        c1 = lokal_ordet[0:1]
        c2 = lokal_ordet[1:2]
        c3 = lokal_ordet[2:3]
        p2 = lokal_ordet[0:2]
        p3 = lokal_ordet[0:3]

        # The order of these tests matters: the first matching rule wins.
        if c1 in _vokaler:
            return '$' + lokal_ordet[1:]
        if p2 in ('DJ', 'GJ', 'HJ', 'LJ'):
            return 'J' + lokal_ordet[2:]
        if c1 == 'G' and c2 in _mjuka_vokaler:
            return 'J' + lokal_ordet[1:]
        if c1 == 'Q':
            return 'K' + lokal_ordet[1:]
        if p2 == 'CH' and c3 in _vokaler:
            return '#' + lokal_ordet[2:]
        if c1 == 'C' and (c2 in _harde_vokaler or c2 in _konsonanter):
            return 'K' + lokal_ordet[1:]
        if c1 == 'X':
            return 'S' + lokal_ordet[1:]
        if c1 == 'C' and c2 in _mjuka_vokaler:
            return 'S' + lokal_ordet[1:]
        if p3 in ('SKJ', 'STJ', 'SCH'):
            return '#' + lokal_ordet[3:]
        if p2 in ('SH', 'KJ', 'TJ', 'SJ'):
            return '#' + lokal_ordet[2:]
        if p2 == 'SK' and c3 in _mjuka_vokaler:
            return '#' + lokal_ordet[2:]
        if c1 == 'K' and c2 in _mjuka_vokaler:
            return '#' + lokal_ordet[1:]
        return lokal_ordet

    def _koda_ordet(ordet):
        """Return the SfinxBis code of a single name token (steps 3-12)."""
        # Step 3: collapse runs of repeated characters
        ordet = _delete_consecutive_repeats(ordet)

        # Step 4: Swedish-ize the spelling
        ordet = _foersvensker(ordet)

        # Step 5: drop every character that is not in A-Ö
        ordet = ''.join(tecken for tecken in ordet if tecken in _alfabet)

        # Step 6: code the first sound
        ordet = _koda_foersta_ljudet(ordet)

        # Step 7: split the name into its first character and the rest
        foersta, rest = ordet[0:1], ordet[1:]

        # Step 8: phonetic transformation of the rest
        rest = rest.replace('DT', 'T').replace('X', 'KS')

        # Step 9: code the rest as digits (C before a soft vowel becomes 8)
        for vokal in _mjuka_vokaler:
            rest = rest.replace('C' + vokal, '8' + vokal)
        rest = rest.translate(_sfinxbis_translation)

        # Step 10: remove adjacent duplicates
        rest = _delete_consecutive_repeats(rest)

        # Step 11: remove every "9"
        rest = rest.replace('9', '')

        # Step 12: put the two parts back together
        kod = foersta + rest

        # truncate, if max_length is set
        return kod[:max_length] if max_length > 0 else kod

    # Step 1: upper-case
    word = unicode_normalize('NFC', text_type(word.upper()))
    word = word.replace('ß', 'SS')
    word = word.replace('-', ' ')

    # Step 2: remove the prefixes/particles
    for adelstitel in adelstitler:
        # Looping is required: removing one occurrence can expose another
        # (e.g. ' DE DE ').
        while adelstitel in word:
            word = word.replace(adelstitel, ' ')
        # At the very start of the name the particle has no leading space.
        if word.startswith(adelstitel[1:]):
            word = word[len(adelstitel) - 1 :]

    # Split word into tokens
    ordlista = word.split()
    if not ordlista:
        # noinspection PyRedundantParentheses
        return ('',)

    return tuple(_koda_ordet(ordet) for ordet in ordlista)
298
299
300 1
def norphone(word):
    """Return the Norphone code.

    The reference implementation by Lars Marius Garshol is available in
    :cite:`Garshol:2015`.

    Norphone was designed for Norwegian, but this implementation has been
    extended to support Swedish vowels as well. This function incorporates
    the "not implemented" rules from the above file's rule set.

    The word is upper-cased and NFC-normalized first, so that letters typed
    as a base character plus a combining mark (e.g. ``A`` + ring above) are
    handled like their precomposed forms (``Å``).

    :param str word: the word to transform
    :returns: the Norphone code
    :rtype: str

    >>> norphone('Hansen')
    'HNSN'
    >>> norphone('Larsen')
    'LRSN'
    >>> norphone('Aagaard')
    'ÅKRT'
    >>> norphone('Braaten')
    'BRTN'
    >>> norphone('Sandvik')
    'SNVK'
    """
    _vowels = {'A', 'E', 'I', 'O', 'U', 'Y', 'Å', 'Æ', 'Ø', 'Ä', 'Ö'}

    # Substitution rules, keyed by the length of the pattern they match.
    replacements = {
        4: {'SKEI': 'X'},
        3: {'SKJ': 'X', 'KEI': 'X'},
        2: {
            'CH': 'K',
            'CK': 'K',
            'GJ': 'J',
            'GH': 'K',
            'HG': 'K',
            'HJ': 'J',
            'HL': 'L',
            'HR': 'R',
            'KJ': 'X',
            'KI': 'X',
            'LD': 'L',
            'ND': 'N',
            'PH': 'F',
            'TH': 'T',
            'SJ': 'X',
        },
        1: {'W': 'V', 'X': 'KS', 'Z': 'S', 'D': 'T', 'G': 'K'},
    }
    # Longest patterns are tried first; computed once rather than once per
    # character of the word.
    lengths = sorted(replacements, reverse=True)

    word = unicode_normalize('NFC', text_type(word.upper()))

    # Word-initial special cases: (prefix, code emitted). The first match
    # wins, and the matched characters are skipped by the main loop.
    pieces = []
    skip = 0
    for prefix, initial_code in (
        ('AA', 'Å'),
        ('GI', 'J'),
        ('SKY', 'X'),
        ('EI', 'Æ'),
        ('KY', 'X'),
        ('C', 'K'),
        ('Ä', 'Æ'),
        ('Ö', 'Ø'),
    ):
        if word[: len(prefix)] == prefix:
            pieces.append(initial_code)
            skip = len(prefix)
            break

    if word[-2:] == 'DT':
        word = word[:-2] + 'T'
    # Though the rules indicate this rule applies in all positions, the
    # reference implementation indicates it applies only in final position.
    elif word[-2:-1] in _vowels and word[-1:] == 'D':
        # NOTE(review): this removes the vowel together with the D, so a
        # two-letter word such as 'ED' yields an empty code -- confirm
        # against the reference implementation before changing.
        word = word[:-2]

    for pos, char in enumerate(word):
        if skip:
            skip -= 1
            continue
        for length in lengths:
            target = replacements[length].get(word[pos : pos + length])
            if target is not None:
                pieces.append(target)
                skip = length - 1
                break
        else:
            # No rule matched: keep consonants everywhere, vowels only in
            # initial position.
            if not pos or char not in _vowels:
                pieces.append(char)

    return _delete_consecutive_repeats(''.join(pieces))
403
404
405
if __name__ == '__main__':
    # When executed as a script, run the examples embedded in the docstrings
    # of this module as doctests.
    from doctest import testmod

    testmod()
409