Completed
Push — master ( f43547...71985b )
by Chris
12:00 queued 10s
created

abydos.phonetic._sfinx_bis   A

Complexity

Total Complexity 33

Size/Duplication

Total Lines 401
Duplicated Lines 0 %

Test Coverage

Coverage 100%

Importance

Changes 0
Metric Value
eloc 206
dl 0
loc 401
ccs 96
cts 96
cp 1
rs 9.76
c 0
b 0
f 0
wmc 33

1 Function

Rating   Name   Duplication   Size   Complexity  
A sfinxbis() 0 35 1

1 Method

Rating   Name   Duplication   Size   Complexity  
F SfinxBis.encode() 0 204 32
1
# -*- coding: utf-8 -*-
2
3
# Copyright 2014-2018 by Christopher C. Little.
4
# This file is part of Abydos.
5
#
6
# Abydos is free software: you can redistribute it and/or modify
7
# it under the terms of the GNU General Public License as published by
8
# the Free Software Foundation, either version 3 of the License, or
9
# (at your option) any later version.
10
#
11
# Abydos is distributed in the hope that it will be useful,
12
# but WITHOUT ANY WARRANTY; without even the implied warranty of
13
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14
# GNU General Public License for more details.
15
#
16
# You should have received a copy of the GNU General Public License
17
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.
18
19 1
"""abydos.phonetic._sfinx_bis.
20
21
SfinxBis
22
"""
23
24 1
from __future__ import (
25
    absolute_import,
26
    division,
27
    print_function,
28
    unicode_literals,
29
)
30
31 1
from unicodedata import normalize as unicode_normalize
32
33 1
from six import text_type
34
35 1
from ._phonetic import _Phonetic
36
37 1
__all__ = ['SfinxBis', 'sfinxbis']
38
39
40 1
class SfinxBis(_Phonetic):
    """SfinxBis code.

    SfinxBis is a Soundex-like algorithm defined in :cite:`Axelsson:2009`.

    This implementation follows the reference implementation:
    :cite:`Sjoo:2009`.

    SfinxBis is intended chiefly for Swedish names.
    """

    # Nobility/name particles stripped before encoding. Multi-word particles
    # come first so that they are removed before their single-word parts.
    _adelstitler = (
        ' DE LA ',
        ' DE LAS ',
        ' DE LOS ',
        ' VAN DE ',
        ' VAN DEN ',
        ' VAN DER ',
        ' VON DEM ',
        ' VON DER ',
        ' AF ',
        ' AV ',
        ' DA ',
        ' DE ',
        ' DEL ',
        ' DEN ',
        ' DES ',
        ' DI ',
        ' DO ',
        ' DON ',
        ' DOS ',
        ' DU ',
        ' E ',
        ' IN ',
        ' LA ',
        ' LE ',
        ' MAC ',
        ' MC ',
        ' VAN ',
        ' VON ',
        ' Y ',
        ' S:T ',
    )

    # "Hard" and "soft" vowels.
    _harde_vokaler = {'A', 'O', 'U', 'Å'}
    _mjuka_vokaler = {'E', 'I', 'Y', 'Ä', 'Ö'}
    # Union of both vowel classes, built once instead of on every call.
    _vokaler = frozenset(_harde_vokaler | _mjuka_vokaler)
    # Fixed processing order for the order-sensitive diphthong rewrite in
    # _foersvensker: hard vowels first, then soft vowels, each sorted.
    # Iterating the sets directly would make the result depend on the
    # interpreter's (randomised) string hashing.
    _vokalordning = tuple(sorted(_harde_vokaler)) + tuple(
        sorted(_mjuka_vokaler)
    )

    # Uppercase consonants.
    _uc_c_set = {
        'B',
        'C',
        'D',
        'F',
        'G',
        'H',
        'J',
        'K',
        'L',
        'M',
        'N',
        'P',
        'Q',
        'R',
        'S',
        'T',
        'V',
        'W',
        'X',
        'Z',
    }
    # Every character that survives the filtering step (A-Z plus Ä, Å, Ö).
    _uc_set = {
        'A',
        'B',
        'C',
        'D',
        'E',
        'F',
        'G',
        'H',
        'I',
        'J',
        'K',
        'L',
        'M',
        'N',
        'O',
        'P',
        'Q',
        'R',
        'S',
        'T',
        'U',
        'V',
        'W',
        'X',
        'Y',
        'Z',
        'Ä',
        'Å',
        'Ö',
    }

    # Letter -> digit table for everything after the first sound; all vowels
    # map to '9', which is dropped again at the end of the encoding.
    _trans = dict(
        zip(
            map(ord, 'BCDFGHJKLMNPQRSTVZAOUÅEIYÄÖ'),
            '123729224551268378999999999',
        )
    )

    # Folding of W, Z and accented letters onto the letters handled above.
    _substitutions = dict(
        zip(
            map(ord, 'WZÀÁÂÃÆÇÈÉÊËÌÍÎÏÑÒÓÔÕØÙÚÛÜÝ'),
            'VSAAAAÄCEEEEIIIINOOOOÖUUUYY',
        )
    )

    def _foersvensker(self, lokal_ordet):
        """Return the Swedish-ized form of the word.

        Parameters
        ----------
        lokal_ordet : str
            Word to transform (already uppercased)

        Returns
        -------
        str
            Transformed word

        """
        for gammal, ny in (
            ('STIERN', 'STJÄRN'),
            ('HIE', 'HJ'),
            ('SIÖ', 'SJÖ'),
            ('SCH', 'SH'),
            ('QU', 'KV'),
            ('IO', 'JO'),
            ('PH', 'F'),
        ):
            lokal_ordet = lokal_ordet.replace(gammal, ny)

        # A vowel followed by Ü, Y or I: the second letter becomes J. The
        # replacements are applied one after another and can feed into each
        # other (e.g. 'EYI'), so the vowels are visited in a fixed order to
        # keep the result reproducible between runs.
        for vokal in self._vokalordning:
            for efter in 'ÜYI':
                lokal_ordet = lokal_ordet.replace(vokal + efter, vokal + 'J')

        # Drop an H that directly precedes a consonant.
        if 'H' in lokal_ordet:
            for konsonant in self._uc_c_set:
                lokal_ordet = lokal_ordet.replace('H' + konsonant, konsonant)

        lokal_ordet = lokal_ordet.translate(self._substitutions)

        lokal_ordet = lokal_ordet.replace('Ð', 'ETH')
        lokal_ordet = lokal_ordet.replace('Þ', 'TH')
        lokal_ordet = lokal_ordet.replace('ß', 'SS')

        return lokal_ordet

    def _koda_foersta_ljudet(self, lokal_ordet):
        """Return the word with the first sound coded.

        The rules are tried in order and the first match wins; a word that
        matches no rule (including the empty string) is returned unchanged.

        Parameters
        ----------
        lokal_ordet : str
            Word to transform

        Returns
        -------
        str
            Transformed word

        """
        # Slices (rather than indexing) so short and empty words are safe.
        first = lokal_ordet[0:1]
        second = lokal_ordet[1:2]
        third = lokal_ordet[2:3]
        head2 = lokal_ordet[0:2]
        head3 = lokal_ordet[0:3]

        if first in self._vokaler:
            return '$' + lokal_ordet[1:]
        if head2 in ('DJ', 'GJ', 'HJ', 'LJ'):
            return 'J' + lokal_ordet[2:]
        if first == 'G' and second in self._mjuka_vokaler:
            return 'J' + lokal_ordet[1:]
        if first == 'Q':
            return 'K' + lokal_ordet[1:]
        if head2 == 'CH' and third in self._vokaler:
            return '#' + lokal_ordet[2:]
        if first == 'C' and (
            second in self._harde_vokaler or second in self._uc_c_set
        ):
            return 'K' + lokal_ordet[1:]
        if first == 'X':
            return 'S' + lokal_ordet[1:]
        if first == 'C' and second in self._mjuka_vokaler:
            return 'S' + lokal_ordet[1:]
        if head3 in ('SKJ', 'STJ', 'SCH'):
            return '#' + lokal_ordet[3:]
        if head2 in ('SH', 'KJ', 'TJ', 'SJ'):
            return '#' + lokal_ordet[2:]
        if head2 == 'SK' and third in self._mjuka_vokaler:
            return '#' + lokal_ordet[2:]
        if first == 'K' and second in self._mjuka_vokaler:
            return '#' + lokal_ordet[1:]
        return lokal_ordet

    def _koda_resten(self, rest):
        """Return the digit code for everything after the first sound.

        Parameters
        ----------
        rest : str
            The word without its (already coded) first character

        Returns
        -------
        str
            The digit code of the remainder

        """
        # Step 8: phonetic transformation of the remainder
        rest = rest.replace('DT', 'T')
        rest = rest.replace('X', 'KS')

        # Step 9: code the remainder as digits; C before a soft vowel is
        # coded as 8 instead of the table value for C
        for vokal in self._mjuka_vokaler:
            rest = rest.replace('C' + vokal, '8' + vokal)
        rest = rest.translate(self._trans)

        # Step 10: remove adjacent duplicates
        # NOTE(review): _delete_consecutive_repeats is not defined in this
        # class; presumably inherited from _Phonetic -- confirm.
        rest = self._delete_consecutive_repeats(rest)

        # Step 11: remove every "9" (the vowel placeholder)
        return rest.replace('9', '')

    def encode(self, word, max_length=-1):
        """Return the SfinxBis code for a word.

        Parameters
        ----------
        word : str
            The word to transform
        max_length : int
            The length of the code returned (defaults to unlimited)

        Returns
        -------
        tuple
            The SfinxBis value, one code per whitespace/hyphen separated
            part of the name that remains after prefix removal; ``('',)``
            if nothing remains

        Examples
        --------
        >>> pe = SfinxBis()
        >>> pe.encode('Christopher')
        ('K68376',)
        >>> pe.encode('Niall')
        ('N4',)
        >>> pe.encode('Smith')
        ('S53',)
        >>> pe.encode('Schmidt')
        ('S53',)

        >>> pe.encode('Johansson')
        ('J585',)
        >>> pe.encode('Sjöberg')
        ('#162',)

        """
        # Step 1: uppercase
        word = unicode_normalize('NFC', text_type(word.upper()))
        word = word.replace('ß', 'SS')
        word = word.replace('-', ' ')

        # Step 2: remove nobility prefixes
        for adelstitel in self._adelstitler:
            # A single replace() misses overlapping occurrences, hence the
            # loop until none is left.
            while adelstitel in word:
                word = word.replace(adelstitel, ' ')
            # The particle may also start the name, without a leading space.
            if word.startswith(adelstitel[1:]):
                word = word[len(adelstitel) - 1 :]

        # Split word into tokens
        # Step 3: remove doubled letters (consecutive repeats) in each token
        ordlista = [
            self._delete_consecutive_repeats(ordet) for ordet in word.split()
        ]
        if not ordlista:
            # noinspection PyRedundantParentheses
            return ('',)

        koder = []
        for ordet in ordlista:
            # Step 4: Swedish-ization
            ordet = self._foersvensker(ordet)

            # Step 5: remove all characters that are not A-Ö
            # (65-90, 196, 197, 214)
            ordet = ''.join(c for c in ordet if c in self._uc_set)

            # Step 6: code the first sound
            ordet = self._koda_foersta_ljudet(ordet)

            # Step 7: split the name into first character and remainder;
            # steps 8-11 code the remainder; step 12 joins the parts again
            koder.append(ordet[0:1] + self._koda_resten(ordet[1:]))

        # truncate, if max_length is set
        if max_length > 0:
            koder = [kod[:max_length] for kod in koder]

        return tuple(koder)
358
359
360 1
def sfinxbis(word, max_length=-1):
    """Return the SfinxBis code for a word.

    This is a convenience wrapper that creates a :py:class:`SfinxBis`
    instance and delegates to :py:meth:`SfinxBis.encode`.

    Parameters
    ----------
    word : str
        The word to transform
    max_length : int
        The length of the code returned (defaults to unlimited)

    Returns
    -------
    tuple
        The SfinxBis value

    Examples
    --------
    >>> sfinxbis('Christopher')
    ('K68376',)
    >>> sfinxbis('Niall')
    ('N4',)
    >>> sfinxbis('Smith')
    ('S53',)
    >>> sfinxbis('Schmidt')
    ('S53',)

    >>> sfinxbis('Johansson')
    ('J585',)
    >>> sfinxbis('Sjöberg')
    ('#162',)

    """
    encoder = SfinxBis()
    return encoder.encode(word, max_length)
395
396
397
if __name__ == '__main__':
    # Executed as a script: run the doctests embedded in this module.
    from doctest import testmod

    testmod()
401