Test Failed
Push — master ( 64abe2...a464fa )
by Chris
04:02 queued 11s
created

abydos.phonetic.sv.sfinxbis()   F

Complexity

Conditions 32

Size

Total Lines 174
Code Lines 103

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
cc 32
eloc 103
nop 2
dl 0
loc 174
rs 0
c 0
b 0
f 0

How to fix   Long Method    Complexity   

Long Method

Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.

For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.

Commonly applied refactorings include Extract Method.

Complexity

Complex classes like abydos.phonetic.sv.sfinxbis() often do a lot of different things. To break such a class down, we need to identify a cohesive component within that class. A common approach to find such a component is to look for fields/methods that share the same prefixes, or suffixes.

Once you have determined the fields that belong together, you can apply the Extract Class refactoring. If the component makes sense as a sub-class, Extract Subclass is also a candidate, and is often faster.

1
# -*- coding: utf-8 -*-
2
3
# Copyright 2014-2018 by Christopher C. Little.
4
# This file is part of Abydos.
5
#
6
# Abydos is free software: you can redistribute it and/or modify
7
# it under the terms of the GNU General Public License as published by
8
# the Free Software Foundation, either version 3 of the License, or
9
# (at your option) any later version.
10
#
11
# Abydos is distributed in the hope that it will be useful,
12
# but WITHOUT ANY WARRANTY; without even the implied warranty of
13
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14
# GNU General Public License for more details.
15
#
16
# You should have received a copy of the GNU General Public License
17
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.
18
19
"""abydos.phonetic.sv.
20
21
The phonetic.sv module implements phonetic algorithms for Scandinavian names
22
& languages (currently Swedish & Norwegian), including:
23
24
    - SfinxBis
25
    - Norphone
26
"""
27
28
from __future__ import unicode_literals
29
30
from unicodedata import normalize as unicode_normalize
31
32
from six import text_type
33
34
from . import _delete_consecutive_repeats
35
36
__all__ = ['norphone', 'sfinxbis']
37
38
39
def sfinxbis(word, max_length=-1):
    """Return the SfinxBis code for a word.

    SfinxBis is a Soundex-like algorithm defined in :cite:`Axelsson:2009`.

    This implementation follows the reference implementation:
    :cite:`Sjoo:2009`.

    SfinxBis is intended chiefly for Swedish names.

    The input is upper-cased and NFC-normalized, hyphens are turned into
    spaces, nobility particles (e.g. ``VON``, ``DE LA``) are removed, and
    the remainder is split on whitespace. Every remaining token is encoded
    on its own, so the returned tuple holds one code per token. If no token
    remains, ``('',)`` is returned.

    :param str word: the word to transform
    :param int max_length: the length of the code returned (defaults to
        unlimited); values of 0 or less disable truncation
    :returns: the SfinxBis value
    :rtype: tuple

    >>> sfinxbis('Christopher')
    ('K68376',)
    >>> sfinxbis('Niall')
    ('N4',)
    >>> sfinxbis('Smith')
    ('S53',)
    >>> sfinxbis('Schmidt')
    ('S53',)

    >>> sfinxbis('Johansson')
    ('J585',)
    >>> sfinxbis('Sjöberg')
    ('#162',)
    """
    # Nobility particles ("adelstitlar") stripped from the name in step 2.
    adelstitler = (' DE LA ', ' DE LAS ', ' DE LOS ', ' VAN DE ', ' VAN DEN ',
                   ' VAN DER ', ' VON DEM ', ' VON DER ',
                   ' AF ', ' AV ', ' DA ', ' DE ', ' DEL ', ' DEN ', ' DES ',
                   ' DI ', ' DO ', ' DON ', ' DOS ', ' DU ', ' E ', ' IN ',
                   ' LA ', ' LE ', ' MAC ', ' MC ', ' VAN ', ' VON ', ' Y ',
                   ' S:T ')

    # These are ordered tuples on purpose. _foersvensker() loops over them
    # while applying chained str.replace() calls whose patterns overlap
    # ('I' and 'Y' are both a leading vowel and a replaced trailing letter),
    # so the outcome depends on iteration order. Iterating over a set would
    # make the code vary between interpreter runs (string hash
    # randomization); a tuple keeps the result reproducible.
    _harde_vokaler = ('A', 'O', 'U', 'Å')       # hard vowels
    _mjuka_vokaler = ('E', 'I', 'Y', 'Ä', 'Ö')  # soft vowels
    _konsonanter = ('B', 'C', 'D', 'F', 'G', 'H', 'J', 'K', 'L', 'M', 'N',
                    'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'X', 'Z')
    _vokaler = frozenset(_harde_vokaler + _mjuka_vokaler)
    _alfabet = frozenset(('A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J',
                          'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T',
                          'U', 'V', 'W', 'X', 'Y', 'Z', 'Ä', 'Å', 'Ö'))

    # Letter -> digit table used for everything after the first sound.
    # All vowels map to '9', which is dropped again in step 11.
    _sfinxbis_translation = dict(zip((ord(tecken) for tecken in
                                      'BCDFGHJKLMNPQRSTVZAOUÅEIYÄÖ'),
                                     '123729224551268378999999999'))

    # Single-character folding of non-Swedish letters onto Swedish ones.
    _sfinxbis_substitutions = dict(zip((ord(tecken) for tecken in
                                        'WZÀÁÂÃÆÇÈÉÊËÌÍÎÏÑÒÓÔÕØÙÚÛÜÝ'),
                                       'VSAAAAÄCEEEEIIIINOOOOÖUUUYY'))

    def _foersvensker(lokal_ordet):
        """Return the Swedish-ized form of the word."""
        lokal_ordet = lokal_ordet.replace('STIERN', 'STJÄRN')
        lokal_ordet = lokal_ordet.replace('HIE', 'HJ')
        lokal_ordet = lokal_ordet.replace('SIÖ', 'SJÖ')
        lokal_ordet = lokal_ordet.replace('SCH', 'SH')
        lokal_ordet = lokal_ordet.replace('QU', 'KV')
        lokal_ordet = lokal_ordet.replace('IO', 'JO')
        lokal_ordet = lokal_ordet.replace('PH', 'F')

        # Vowel followed by Ü/Y/I: the second letter becomes J. Hard vowels
        # are handled before soft vowels, each group in its declared order.
        for vokal in _harde_vokaler + _mjuka_vokaler:
            lokal_ordet = lokal_ordet.replace(vokal+'Ü', vokal+'J')
            lokal_ordet = lokal_ordet.replace(vokal+'Y', vokal+'J')
            lokal_ordet = lokal_ordet.replace(vokal+'I', vokal+'J')

        # An H directly before a consonant is dropped.
        if 'H' in lokal_ordet:
            for konsonant in _konsonanter:
                lokal_ordet = lokal_ordet.replace('H'+konsonant, konsonant)

        lokal_ordet = lokal_ordet.translate(_sfinxbis_substitutions)

        lokal_ordet = lokal_ordet.replace('Ð', 'ETH')
        lokal_ordet = lokal_ordet.replace('Þ', 'TH')
        lokal_ordet = lokal_ordet.replace('ß', 'SS')

        return lokal_ordet

    def _koda_foersta_ljudet(lokal_ordet):
        """Return the word with the first sound coded.

        The branches are tried in order and only the first match applies.
        Slices (rather than indexing) are used throughout so that empty
        and very short words fall through without raising IndexError.
        """
        if lokal_ordet[0:1] in _vokaler:
            lokal_ordet = '$' + lokal_ordet[1:]
        elif lokal_ordet[0:2] in ('DJ', 'GJ', 'HJ', 'LJ'):
            lokal_ordet = 'J' + lokal_ordet[2:]
        elif lokal_ordet[0:1] == 'G' and lokal_ordet[1:2] in _mjuka_vokaler:
            lokal_ordet = 'J' + lokal_ordet[1:]
        elif lokal_ordet[0:1] == 'Q':
            lokal_ordet = 'K' + lokal_ordet[1:]
        elif lokal_ordet[0:2] == 'CH' and lokal_ordet[2:3] in _vokaler:
            lokal_ordet = '#' + lokal_ordet[2:]
        elif lokal_ordet[0:1] == 'C' and lokal_ordet[1:2] in _harde_vokaler:
            lokal_ordet = 'K' + lokal_ordet[1:]
        elif lokal_ordet[0:1] == 'C' and lokal_ordet[1:2] in _konsonanter:
            lokal_ordet = 'K' + lokal_ordet[1:]
        elif lokal_ordet[0:1] == 'X':
            lokal_ordet = 'S' + lokal_ordet[1:]
        elif lokal_ordet[0:1] == 'C' and lokal_ordet[1:2] in _mjuka_vokaler:
            lokal_ordet = 'S' + lokal_ordet[1:]
        elif lokal_ordet[0:3] in ('SKJ', 'STJ', 'SCH'):
            lokal_ordet = '#' + lokal_ordet[3:]
        elif lokal_ordet[0:2] in ('SH', 'KJ', 'TJ', 'SJ'):
            lokal_ordet = '#' + lokal_ordet[2:]
        elif lokal_ordet[0:2] == 'SK' and lokal_ordet[2:3] in _mjuka_vokaler:
            lokal_ordet = '#' + lokal_ordet[2:]
        elif lokal_ordet[0:1] == 'K' and lokal_ordet[1:2] in _mjuka_vokaler:
            lokal_ordet = '#' + lokal_ordet[1:]
        return lokal_ordet

    # Step 1: upper-case (and normalize to composed Unicode form)
    word = unicode_normalize('NFC', text_type(word.upper()))
    word = word.replace('ß', 'SS')
    word = word.replace('-', ' ')

    # Step 2: remove nobility prefixes, both inside the name (surrounded by
    # spaces) and at the very start (no leading space to match there)
    for adelstitel in adelstitler:
        while adelstitel in word:
            word = word.replace(adelstitel, ' ')
        if word.startswith(adelstitel[1:]):
            word = word[len(adelstitel)-1:]

    # Split word into tokens
    ordlista = word.split()

    # Step 3: remove doubled letters
    ordlista = [_delete_consecutive_repeats(ordet) for ordet in ordlista]
    if not ordlista:
        # noinspection PyRedundantParentheses
        return ('',)

    # Step 4: Swedish-ize the spelling
    ordlista = [_foersvensker(ordet) for ordet in ordlista]

    # Step 5: remove every character that is not A-Ö (65-90,196,197,214)
    ordlista = [''.join(c for c in ordet if c in _alfabet)
                for ordet in ordlista]

    # Step 6: code the first sound
    ordlista = [_koda_foersta_ljudet(ordet) for ordet in ordlista]

    # Step 7: split each name into its first character and the rest
    rest = [ordet[1:] for ordet in ordlista]

    # Step 8: phonetic transformation of the rest
    rest = [ordet.replace('DT', 'T') for ordet in rest]
    rest = [ordet.replace('X', 'KS') for ordet in rest]

    # Step 9: code the rest as digits; C before a soft vowel is coded 8
    # instead of the default 2 from the translation table
    for vokal in _mjuka_vokaler:
        rest = [ordet.replace('C'+vokal, '8'+vokal) for ordet in rest]
    rest = [ordet.translate(_sfinxbis_translation) for ordet in rest]

    # Step 10: remove adjacent duplicates
    rest = [_delete_consecutive_repeats(ordet) for ordet in rest]

    # Step 11: remove every "9" (the vowel placeholder)
    rest = [ordet.replace('9', '') for ordet in rest]

    # Step 12: put the two parts back together
    ordlista = [ordet[0:1] + resten for ordet, resten in zip(ordlista, rest)]

    # truncate, if max_length is set
    if max_length > 0:
        ordlista = [ordet[:max_length] for ordet in ordlista]

    return tuple(ordlista)
213
214
215
def norphone(word):
    """Return the Norphone code.

    The reference implementation by Lars Marius Garshol is available in
    :cite:`Garshol:2015`.

    Norphone was designed for Norwegian, but this implementation has been
    extended to support Swedish vowels as well. This function incorporates
    the "not implemented" rules from the above file's rule set.

    The word is upper-cased, an optional word-initial rule supplies the
    first code character, a word-final ``DT``/vowel+``D`` adjustment is
    applied, and the remainder is scanned left to right, preferring the
    longest matching replacement. Vowels are only kept in first position.

    :param str word: the word to transform
    :returns: the Norphone code
    :rtype: str

    >>> norphone('Hansen')
    'HNSN'
    >>> norphone('Larsen')
    'LRSN'
    >>> norphone('Aagaard')
    'ÅKRT'
    >>> norphone('Braaten')
    'BRTN'
    >>> norphone('Sandvik')
    'SNVK'
    """
    _vowels = {'A', 'E', 'I', 'O', 'U', 'Y', 'Å', 'Æ', 'Ø', 'Ä', 'Ö'}

    # Word-initial rules as (prefix, code) pairs; the first match wins.
    initial_rules = (('AA', 'Å'), ('GI', 'J'), ('SKY', 'X'), ('EI', 'Æ'),
                     ('KY', 'X'), ('C', 'K'), ('Ä', 'Æ'), ('Ö', 'Ø'))

    # Replacement tables paired with their key length, longest first, so
    # that the scan below always prefers the longest match.
    replacements = (
        (4, {'SKEI': 'X'}),
        (3, {'SKJ': 'X', 'KEI': 'X'}),
        (2, {'CH': 'K', 'CK': 'K', 'GJ': 'J', 'GH': 'K', 'HG': 'K',
             'HJ': 'J', 'HL': 'L', 'HR': 'R', 'KJ': 'X', 'KI': 'X',
             'LD': 'L', 'ND': 'N', 'PH': 'F', 'TH': 'T', 'SJ': 'X'}),
        (1, {'W': 'V', 'X': 'KS', 'Z': 'S', 'D': 'T', 'G': 'K'}),
    )

    word = word.upper()

    # Apply at most one word-initial rule; the cursor starts past the
    # characters that rule consumed.
    code = ''
    pos = 0
    for prefix, initial in initial_rules:
        if word[:len(prefix)] == prefix:
            code = initial
            pos = len(prefix)
            break

    if word[-2:] == 'DT':
        word = word[:-2]+'T'
    # Though the rules indicate this rule applies in all positions, the
    # reference implementation indicates it applies only in final position.
    elif word[-2:-1] in _vowels and word[-1:] == 'D':
        word = word[:-2]

    end = len(word)
    while pos < end:
        for length, table in replacements:
            chunk = word[pos:pos+length]
            if chunk in table:
                code += table[chunk]
                pos += length
                break
        else:
            # No replacement matched: keep the character unless it is a
            # vowel outside of first position.
            char = word[pos]
            if not pos or char not in _vowels:
                code += char
            pos += 1

    return _delete_consecutive_repeats(code)
302
303
304
if __name__ == '__main__':
    # When executed as a script, run the doctests embedded in the
    # docstrings of this module.
    import doctest
    doctest.testmod()
307