Completed
Pull Request — master (#141)
by Chris
11:42
created

abydos.phonetic._SfinxBis   A

Complexity

Total Complexity 33

Size/Duplication

Total Lines 383
Duplicated Lines 0 %

Test Coverage

Coverage 100%

Importance

Changes 0
Metric Value
eloc 206
dl 0
loc 383
ccs 96
cts 96
cp 1
rs 9.76
c 0
b 0
f 0
wmc 33

1 Function

Rating   Name   Duplication   Size   Complexity  
A sfinxbis() 0 30 1

1 Method

Rating   Name   Duplication   Size   Complexity  
F SfinxBis.encode() 0 191 32
1
# -*- coding: utf-8 -*-
0 ignored issues
show
Coding Style Naming introduced by
The name _SfinxBis does not conform to the module naming conventions ((([a-z_][a-z0-9_]*)|([A-Z][a-zA-Z0-9]+))$).

This check looks for invalid names for a range of different identifiers.

You can set regular expressions to which the identifiers must conform if the defaults do not match your requirements.

If your project includes a Pylint configuration file, the settings contained in that file take precedence.

To find out more about Pylint, please refer to their site.

Loading history...
2
3
# Copyright 2014-2018 by Christopher C. Little.
4
# This file is part of Abydos.
5
#
6
# Abydos is free software: you can redistribute it and/or modify
7
# it under the terms of the GNU General Public License as published by
8
# the Free Software Foundation, either version 3 of the License, or
9
# (at your option) any later version.
10
#
11
# Abydos is distributed in the hope that it will be useful,
12
# but WITHOUT ANY WARRANTY; without even the implied warranty of
13
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14
# GNU General Public License for more details.
15
#
16
# You should have received a copy of the GNU General Public License
17
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.
18
19 1
"""abydos.phonetic._SfinxBis.
20
21
SfinxBis
22
"""
23
24 1
from __future__ import (
25
    absolute_import,
26
    division,
27
    print_function,
28
    unicode_literals,
29
)
30
31 1
from unicodedata import normalize as unicode_normalize
32
33 1
from six import text_type
34
35 1
from ._Phonetic import Phonetic
36
37 1
# Names exported by a star-import of this module: the encoder class and its
# function-style wrapper, both defined below.
__all__ = ['SfinxBis', 'sfinxbis']
38
39
40 1
class SfinxBis(Phonetic):
    """SfinxBis code.

    SfinxBis is a Soundex-like algorithm defined in :cite:`Axelsson:2009`.

    This implementation follows the reference implementation:
    :cite:`Sjoo:2009`.

    SfinxBis is intended chiefly for Swedish names.
    """

    # Name particles that encode() strips from the input. Each entry carries
    # its surrounding spaces and the entries are tried in this order.
    _adelstitler = (
        ' DE LA ',
        ' DE LAS ',
        ' DE LOS ',
        ' VAN DE ',
        ' VAN DEN ',
        ' VAN DER ',
        ' VON DEM ',
        ' VON DER ',
        ' AF ',
        ' AV ',
        ' DA ',
        ' DE ',
        ' DEL ',
        ' DEN ',
        ' DES ',
        ' DI ',
        ' DO ',
        ' DON ',
        ' DOS ',
        ' DU ',
        ' E ',
        ' IN ',
        ' LA ',
        ' LE ',
        ' MAC ',
        ' MC ',
        ' VAN ',
        ' VON ',
        ' Y ',
        ' S:T ',
    )

    # "Hard" and "soft" vowels; several first-sound rules depend on which
    # group the following vowel belongs to.
    _harde_vokaler = {'A', 'O', 'U', 'Å'}
    _mjuka_vokaler = {'E', 'I', 'Y', 'Ä', 'Ö'}

    # Upper-case consonants.
    _uc_c_set = {
        'B', 'C', 'D', 'F', 'G', 'H', 'J', 'K', 'L', 'M',
        'N', 'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'X', 'Z',
    }

    # Every character allowed to survive step 5 of encode().
    _uc_set = {
        'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J',
        'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T',
        'U', 'V', 'W', 'X', 'Y', 'Z', 'Ä', 'Å', 'Ö',
    }

    # Translation table (code point -> digit) for the tail of each token.
    # 'H' and all vowels map to '9', which encode() deletes in step 11.
    _trans = dict(
        zip(
            (ord(tecken) for tecken in 'BCDFGHJKLMNPQRSTVZAOUÅEIYÄÖ'),
            '123729224551268378999999999',
        )
    )

    # Translation table (code point -> letter) folding W, Z and accented
    # letters onto the alphabet in _uc_set.
    _substitutions = dict(
        zip(
            (ord(tecken) for tecken in 'WZÀÁÂÃÆÇÈÉÊËÌÍÎÏÑÒÓÔÕØÙÚÛÜÝ'),
            'VSAAAAÄCEEEEIIIINOOOOÖUUUYY',
        )
    )

    def _foersvensker(self, lokal_ordet):
        """Return the Swedish-ized form of the word.

        Args:
            lokal_ordet (str): Upper-cased token to transform

        Returns:
            str: Transformed token

        """
        for gammal, ny in (
            ('STIERN', 'STJÄRN'),
            ('HIE', 'HJ'),
            ('SIÖ', 'SJÖ'),
            ('SCH', 'SH'),
            ('QU', 'KV'),
            ('IO', 'JO'),
            ('PH', 'F'),
        ):
            lokal_ordet = lokal_ordet.replace(gammal, ny)

        # A vowel followed by Ü, Y or I: the second letter becomes J.
        # These replacements interact (e.g. 'EYI' gives 'EJI' if E is handled
        # before Y, but 'EJJ' the other way round), so the vowels are visited
        # in a fixed order -- hard vowels first, each group sorted -- rather
        # than in hash-dependent set order, which can change between runs.
        for vokal in sorted(self._harde_vokaler) + sorted(
            self._mjuka_vokaler
        ):
            for efterled in ('Ü', 'Y', 'I'):
                lokal_ordet = lokal_ordet.replace(
                    vokal + efterled, vokal + 'J'
                )

        # Drop an H that directly precedes a consonant. Sorted for the same
        # reason as above: the result must not depend on set ordering.
        if 'H' in lokal_ordet:
            for konsonant in sorted(self._uc_c_set):
                lokal_ordet = lokal_ordet.replace('H' + konsonant, konsonant)

        lokal_ordet = lokal_ordet.translate(self._substitutions)

        lokal_ordet = lokal_ordet.replace('Ð', 'ETH')
        lokal_ordet = lokal_ordet.replace('Þ', 'TH')
        lokal_ordet = lokal_ordet.replace('ß', 'SS')

        return lokal_ordet

    def _koda_foersta_ljudet(self, lokal_ordet):
        """Return the word with the first sound coded.

        The rules are tried in order and only the first match is applied.
        An initial vowel becomes '$'; initial clusters such as SJ, SKJ, STJ,
        TJ, KJ, SH and CH+vowel become '#'; other rules rewrite the first
        letter(s) to J, K or S. A token matching no rule (including the
        empty string) is returned unchanged.

        Args:
            lokal_ordet (str): Token to transform

        Returns:
            str: Transformed token

        """
        mjuka = self._mjuka_vokaler
        harde = self._harde_vokaler

        # Slices (not indexing) so short and empty tokens never raise.
        first = lokal_ordet[0:1]
        second = lokal_ordet[1:2]
        third = lokal_ordet[2:3]
        pair = lokal_ordet[0:2]

        if first in mjuka or first in harde:
            return '$' + lokal_ordet[1:]
        if pair in ('DJ', 'GJ', 'HJ', 'LJ'):
            return 'J' + lokal_ordet[2:]
        if first == 'G' and second in mjuka:
            return 'J' + lokal_ordet[1:]
        if first == 'Q':
            return 'K' + lokal_ordet[1:]
        if pair == 'CH' and (third in mjuka or third in harde):
            return '#' + lokal_ordet[2:]
        if first == 'C' and (second in harde or second in self._uc_c_set):
            return 'K' + lokal_ordet[1:]
        if first == 'X':
            return 'S' + lokal_ordet[1:]
        if first == 'C' and second in mjuka:
            return 'S' + lokal_ordet[1:]
        if lokal_ordet[0:3] in ('SKJ', 'STJ', 'SCH'):
            return '#' + lokal_ordet[3:]
        if pair in ('SH', 'KJ', 'TJ', 'SJ'):
            return '#' + lokal_ordet[2:]
        if pair == 'SK' and third in mjuka:
            return '#' + lokal_ordet[2:]
        if first == 'K' and second in mjuka:
            return '#' + lokal_ordet[1:]
        return lokal_ordet

    def encode(self, word, max_length=-1):
        """Return the SfinxBis code for a word.

        Args:
            word (str): The word to transform
            max_length (int): The length of the code returned (defaults to
                unlimited); each code is truncated only when this is
                greater than 0

        Returns:
            tuple: The SfinxBis value, one code per whitespace-separated
            token remaining after name particles are removed, or ``('',)``
            when no token remains

        Examples:
            >>> pe = SfinxBis()
            >>> pe.encode('Christopher')
            ('K68376',)
            >>> pe.encode('Niall')
            ('N4',)
            >>> pe.encode('Smith')
            ('S53',)
            >>> pe.encode('Schmidt')
            ('S53',)

            >>> pe.encode('Johansson')
            ('J585',)
            >>> pe.encode('Sjöberg')
            ('#162',)

        """
        # Step 1: upper-case, normalize to NFC, treat hyphens as spaces
        word = unicode_normalize('NFC', text_type(word.upper()))
        word = word.replace('ß', 'SS')
        word = word.replace('-', ' ')

        # Step 2: remove name particles, both inside the name (surrounded
        # by spaces) and at its very start (no leading space)
        for adelstitel in self._adelstitler:
            # Looping handles occurrences that only appear after a removal.
            while adelstitel in word:
                word = word.replace(adelstitel, ' ')
            if word.startswith(adelstitel[1:]):
                word = word[len(adelstitel) - 1 :]

        # Step 3: split into tokens and pass each through the inherited
        # _delete_consecutive_repeats helper (presumably collapsing doubled
        # letters -- the helper is defined outside this class)
        ordlista = [
            self._delete_consecutive_repeats(ordet) for ordet in word.split()
        ]
        if not ordlista:
            # noinspection PyRedundantParentheses
            return ('',)

        # Step 4: Swedish-ize the spelling
        ordlista = [self._foersvensker(ordet) for ordet in ordlista]

        # Step 5: drop every character that is not in A-Z, Ä, Å, Ö
        ordlista = [
            ''.join(tecken for tecken in ordet if tecken in self._uc_set)
            for ordet in ordlista
        ]

        # Step 6: code the first sound
        ordlista = [self._koda_foersta_ljudet(ordet) for ordet in ordlista]

        # Step 7: split off everything after the first character
        rest = [ordet[1:] for ordet in ordlista]

        # Step 8: phonetic transformation of the remainder
        rest = [ordet.replace('DT', 'T').replace('X', 'KS') for ordet in rest]

        # Step 9: code the remainder as digits; C before a soft vowel is
        # coded as '8' instead of the '2' it gets from the table
        for vokal in self._mjuka_vokaler:
            rest = [ordet.replace('C' + vokal, '8' + vokal) for ordet in rest]
        rest = [ordet.translate(self._trans) for ordet in rest]

        # Step 10: pass the digit strings through the same repeat-removal
        # helper as in step 3
        rest = [self._delete_consecutive_repeats(ordet) for ordet in rest]

        # Step 11: remove every '9'
        rest = [ordet.replace('9', '') for ordet in rest]

        # Step 12: join first character and coded remainder again
        ordlista = [
            ordet[0:1] + resten for ordet, resten in zip(ordlista, rest)
        ]

        # truncate, if max_length is set
        if max_length > 0:
            ordlista = [ordet[:max_length] for ordet in ordlista]

        return tuple(ordlista)
345
346
347 1
def sfinxbis(word, max_length=-1):
    """Compute the SfinxBis code of a word.

    Convenience function: builds a :py:class:`SfinxBis` instance and
    delegates to :py:meth:`SfinxBis.encode`.

    Args:
        word (str): The word to transform
        max_length (int): The length of the code returned (defaults to
            unlimited); handed to :py:meth:`SfinxBis.encode` unchanged

    Returns:
        tuple: The SfinxBis value

    Examples:
        >>> sfinxbis('Christopher')
        ('K68376',)
        >>> sfinxbis('Niall')
        ('N4',)
        >>> sfinxbis('Smith')
        ('S53',)
        >>> sfinxbis('Schmidt')
        ('S53',)

        >>> sfinxbis('Johansson')
        ('J585',)
        >>> sfinxbis('Sjöberg')
        ('#162',)

    """
    encoder = SfinxBis()
    return encoder.encode(word, max_length)
377
378
379
if __name__ == '__main__':
    # Executed as a script: run the doctest examples embedded in this
    # module's docstrings.
    from doctest import testmod

    testmod()
383